1 Sample: ZF Stages

This file produces additional analyses plots for sample ZF Stages, iteration of archR result.

1.1 Parameter settings for this report

params_df <- data.frame(param = names(params),
                        value = as.vector(unlist(params)))

knitr::kable(params_df)
param value
use_title (Improved) archR result analysis report for CTSS in Zebrafish
sample_name ZF Stages
iqw_order_by_median TRUE
data_path_on_disk
result_path_on_disk
result_dir_name
do_plot_seq_image TRUE
do_plot_arch TRUE
do_iqw_tpm_plots TRUE
do_bedfile_write TRUE

1.2 Plots for sections marked TRUE are included in this report.

sample_names <- c("64_cells", "dome_30perc_epiboly", "prim6")

archR_zf_best_run <- vector("list", length(sample_names))
names(archR_zf_best_run) <- sample_names

zf_2013_result <- vector("list", length(sample_names))
names(zf_2013_result) <- sample_names

zf_nepal2013_bed_info <- vector("list", length(sample_names))
names(zf_nepal2013_bed_info) <- sample_names

seqs_clusters_as_list <- vector("list", length(sample_names))
names(seqs_clusters_as_list) <- sample_names

seqs_clusters_as_list_ordered <- vector("list", length(sample_names))
names(seqs_clusters_as_list_ordered) <- sample_names

perSample_CAGEobj <- vector("list", length(sample_names))
names(perSample_CAGEobj) <- sample_names

perSample_peakAnno <- vector("list", length(sample_names))
names(perSample_peakAnno) <- sample_names

samarth_df <- vector("list", length(sample_names))
names(samarth_df) <- sample_names
##
phastCons_scores <- vector("list", length(sample_names))
names(phastCons_scores) <- sample_names

##
## lists storing plots
## 
## store iqw+tpm plots
perSample_pl <- vector("list", length(sample_names)) 
names(perSample_pl) <- sample_names

## store architectures' plots
perSample_arch <- vector("list", length(sample_names)) 
names(perSample_arch) <- sample_names

## store architectures' plots combined
perSample_arch_combined <- vector("list", length(sample_names)) 
names(perSample_arch_combined) <- sample_names


perSample_arch_posStrand <- vector("list", length(sample_names)) 
names(perSample_arch_posStrand) <- sample_names

perSample_arch_negStrand <- vector("list", length(sample_names)) 
names(perSample_arch_negStrand) <- sample_names

## store result_directory path
result_dir_path <- vector("list", length(sample_names)) 
names(result_dir_path) <- sample_names

## store go plots
perSample_go <- vector("list", length(sample_names)) 
names(perSample_go) <- sample_names


## store clusters
perSample_archR_clusts <- vector("list", length(sample_names)) 
names(perSample_archR_clusts) <- sample_names

## store overlaps info
perCluster_overlaps_perSample <- vector("list", length(sample_names)) 
names(perCluster_overlaps_perSample) <- sample_names


################################################################################
## Per section some variables need to be set. These are done here

# txdb <- TxDb.Dmelanogaster.UCSC.dm6.ensGene
## CAGEr object processing


# read TagClusters information from corresponding RDS file for sample
# All samples in one RDS object
cager_obj <- file.path("/mnt/storage/cage_datasets/",
                       "danio_rerio/danRer10/cager_objects",
            paste0("myCAGEset_Normalised_TagC_drerio_",
                   "danRer10_Nepal_processedMergedSamples.rds"))

ok_chr_names <- paste0("chr", 1:25)

for(sn in sample_names){
    result_dir_path[[sn]] <- file.path(archR_org_results_path, 
                                       paste0(sn, "_results"))
}
for(sn in sample_names){
    archR_zf_best_run[[sn]] <- 
    file.path(archR_org_results_path, 
        paste0("archR_result_zebrafish_nepal2012_", sn,
               "_modSelType_stability_chunkSize_1000_", 
               "bound_1e-06_collate_FTFFF"))
    if(file.exists(archR_zf_best_run[[sn]])){
    zf_2013_result[[sn]] <- readRDS(file.path(archR_zf_best_run[[sn]],
                                          "archRresult.rds"))
    }else{
    stop("Check if file exists. ", archR_zf_best_run[[sn]])
    }
    ## Bed Info -- IQW and dominant TPM values 
    bed_fname <- file.path(archR_org_data_path, paste0("samarth_zf_TC_sample_Drerio_", sn, "_flank_500.bed"))
    print(bed_fname)
    
    zf_nepal2013_bed_info[[sn]] <- read.delim(file = bed_fname,
                          sep = "\t", header = FALSE,
                          col.names = c("chr", "start", "end", "IQW", 
                                        "domTPM", "strand"))
}
## [1] "data/zebrafish-nepal2013/new/samarth_zf_TC_sample_Drerio_64_cells_flank_500.bed"
## [1] "data/zebrafish-nepal2013/new/samarth_zf_TC_sample_Drerio_dome_30perc_epiboly_flank_500.bed"
## [1] "data/zebrafish-nepal2013/new/samarth_zf_TC_sample_Drerio_prim6_flank_500.bed"
## SAMPLE: 64_cells
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/64_cells_results
## ℹ Using saved GRanges obj from CAGEr object
## >> preparing features information...      2023-01-09 16:32:06 
## >> identifying nearest features...        2023-01-09 16:32:07 
## >> calculating distance from peak to TSS...   2023-01-09 16:32:07 
## >> assigning genomic annotation...        2023-01-09 16:32:07 
## >> adding gene annotation...          2023-01-09 16:32:18
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths           2023-01-09 16:32:18 
## >> done...                    2023-01-09 16:32:18
## SAMPLE: dome_30perc_epiboly
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/dome_30perc_epiboly_results
## ℹ Using saved GRanges obj from CAGEr object
## >> preparing features information...      2023-01-09 16:32:18 
## >> identifying nearest features...        2023-01-09 16:32:18 
## >> calculating distance from peak to TSS...   2023-01-09 16:32:19 
## >> assigning genomic annotation...        2023-01-09 16:32:19 
## >> adding gene annotation...          2023-01-09 16:32:20
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths           2023-01-09 16:32:20 
## >> done...                    2023-01-09 16:32:20
## SAMPLE: prim6
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/prim6_results
## ℹ Using saved GRanges obj from CAGEr object
## >> preparing features information...      2023-01-09 16:32:20 
## >> identifying nearest features...        2023-01-09 16:32:20 
## >> calculating distance from peak to TSS...   2023-01-09 16:32:20 
## >> assigning genomic annotation...        2023-01-09 16:32:20 
## >> adding gene annotation...          2023-01-09 16:32:21
## 'select()' returned 1:many mapping between keys and columns
## >> assigning chromosome lengths           2023-01-09 16:32:22 
## >> done...                    2023-01-09 16:32:22

1.3 Order archR clusters by median IQ width

By default, the clusters are ordered by their median interquantile widths. One can choose to order by the mean interquantile widths by setting paramater iqw_order_by_median to FALSE.

1.3.1 Stage 64 cells


## Sample 1, 64 cells
sn <- sample_names[1]
itr <- 5
use_aggl <- 'ward.D'
use_dist <- 'cor'

iter5_clusts_reord <- archR::collate_archR_result(
  result = zf_2013_result[[sn]],
  iter = itr, clust_method = 'hc', aggl_method = use_aggl,
  dist_method = use_dist, regularize = FALSE, topn = 50,
  flag = list(debugFlag = FALSE, verboseFlag = TRUE), collate = FALSE,
  return_order = TRUE)


## Zebrafish result final clustering already uses cor + complete linkage
## So the result looks already good.

## these are in default archR ordering
clust_archR_ord_list <- archR::get_seqs_clust_list(
  seqs_clust_lab = zf_2013_result[[ sn ]]$seqsClustLabels[[itr]])

## these are now ordered by the hc ordering
clust_hc_ord_list <- lapply(iter5_clusts_reord$order, function(x){
  clust_archR_ord_list[[x]]
})


ordered_arch_pl <- archR::plot_arch_for_clusters(
  seqs = zf_2013_result[[ sn ]]$rawSeqs, 
  clust_list = clust_hc_ord_list, pos_lab = -45:150, 
  xt_freq = 5, set_titles = FALSE, show = FALSE, bits_yax = "auto")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the ggseqlogo package.
##   Please report the issue at <https://github.com/omarwagih/ggseqlogo/issues>.

ordered_arch_pl2 <- lapply(rev(ordered_arch_pl), function(pl){
  pl <- pl + 
        ggplot2::theme(axis.text = ggplot2::element_text(size = 0), 
                       axis.text.x = ggplot2::element_text(
                         angle = 0, vjust = 2, hjust = 0.5),
                       axis.text.y = ggplot2::element_text(vjust = 0.5),
                       axis.title.y = ggplot2::element_text(size = 0),
                       axis.ticks.length = ggplot2::unit(0.00, "cm"),
                       plot.margin = ggplot2::unit(c(-0.1,0,-0.4,-0.4), "cm"))
})
sam_foo <- cowplot::plot_grid(plotlist = ordered_arch_pl2, ncol = 1)

use_cutk <- 5

stopifnot(check_and_create_dir(result_dir_path[[sn]]))
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/64_cells_results
fname <- file.path(result_dir_path[[sn]], paste0(sn, "_dend_arch_list_", 
                    use_dist, "_", use_aggl, "_cut", use_cutk))

sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname, use_cutk = use_cutk,
              clusts = iter5_clusts_reord, use_ht = 30, plot_png = FALSE,
              lwd = 0.4, repel = TRUE, show_labels = TRUE, labels_track_height = 0.25,
              rect = TRUE, rect_fill = TRUE,
              color_labels_by_k = TRUE)
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan

# sam_foo2


###
temp_clusts <- cutree(iter5_clusts_reord, k = use_cutk)
names(temp_clusts) <- NULL
## Make further few clusters
nCl <- length(unique(temp_clusts))

temp_clusts[c(14,28,30,31)] <- nCl + 1


existing_clust <- sort(unique(temp_clusts))
for(i in seq_along(existing_clust)){
    idx <- which(temp_clusts == existing_clust[i])
    temp_clusts[idx] <- i
}

stopifnot(check_and_create_dir(result_dir_path[[sn]]))
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/64_cells_results
fname2 <- file.path(result_dir_path[[sn]], paste0(sn, "_dend_arch_list_",
                                           use_dist, "_", use_aggl, "_",
                                           use_cutk, "clusters_final"))

## Also need to show alongside, how the final clusters' seqlogos look
clust_list <- lapply(unique(temp_clusts), function(x){which(temp_clusts == x)})
seqs_clusters_as_list[[sn]] <- archR::collate_clusters(clust_list, archR::get_seqs_clust_list(zf_2013_result[[sn]]$seqsClustLabels[[itr]]))
##

use_color <- scales::hue_pal()(length(unique(temp_clusts)))
sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname2, use_ht = 60,
               use_cutk = use_cutk,#length(unique(temp_clusts)), 
               clusts = iter5_clusts_reord, rect = TRUE, rect_fill = TRUE,
               label_cols = use_color[temp_clusts[iter5_clusts_reord$order]],
               k_colors = use_color, 
               clust_assignment = clust_list,
               new_clusts = seqs_clusters_as_list[[sn]],
               rawSeqs = zf_2013_result[[sn]]$rawSeqs,
               palette = FALSE, plot_png = FALSE)
## Warning in get_col(col, k): Length of color vector was longer than the number of
## clusters - first k elements are used



# use_color <- scales::hue_pal()(length(unique(temp_clusts)))
# sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname2,
#                use_cutk = use_cutk,
#                #length(unique(temp_clusts)),
#                clusts = iter5_clusts_reord, rect = TRUE, rect_fill = TRUE,
#                label_cols = use_color[temp_clusts[iter5_clusts_reord$order]],
#                k_colors = use_color, plot_png = FALSE,
#                palette = FALSE)

##


# clust_list <- lapply(unique(temp_clusts), function(x){which(temp_clusts == x)})
# seqs_clusters_as_list[[sn]] <- archR::collate_clusters(clust_list, archR::get_seqs_clust_list(zf_2013_result[[sn]]$seqsClustLabels[[itr]]))
###


cluster_medians_IQW <- unlist(lapply(seqs_clusters_as_list[[sn]], function(x){
    median(zf_nepal2013_bed_info[[sn]]$IQW[x])
}))
cluster_means_IQW <- unlist(lapply(seqs_clusters_as_list[[sn]], function(x){
    mean(zf_nepal2013_bed_info[[sn]]$IQW[x])
}))

if(iqw_order_by_median){
    ascending_order_IQW <- sort(cluster_medians_IQW, decreasing = FALSE,
                            index.return = TRUE)
}else{
    ascending_order_IQW <- sort(cluster_means_IQW, decreasing = FALSE,
                            index.return = TRUE)
}
##
seqs_clusters_as_list_ordered[[sn]] <- 
                      lapply(ascending_order_IQW$ix,
                              function(x){
                                  seqs_clusters_as_list[[sn]][[x]]
                              })

perSample_archR_clusts[[sn]] <- seqs_clusters_as_list_ordered[[sn]]

1.3.2 Stage dome_30%_Epiboly


## Sample 2, dome_30perc_epiboly needs final clustering
sn <- sample_names[2]
itr <- 5
use_aggl <- 'ward.D'
use_dist <- 'cor'


# iter5_clusts <- archR::collate_archR_result(
#   result = zf_2013_result[[sample_names[1]]],
#   iter = itr, clust_method = 'hc', aggl_method = use_aggl,
#   dist_method = use_dist, regularize = TRUE, topn = 50,
#   flag = list(debugFlag = TRUE, verboseFlag = TRUE), collate = TRUE,
#   return_order = FALSE, minClusters = 4)

iter5_clusts_reord <- archR::collate_archR_result(
  result = zf_2013_result[[sn]],
  iter = itr, clust_method = 'hc', aggl_method = use_aggl,
  dist_method = use_dist, regularize = TRUE, topn = 50,
  flag = list(debugFlag = FALSE, verboseFlag = TRUE), collate = FALSE,
  return_order = TRUE)


# iter5_clusts_reord1 <- archR::collate_archR_result(
#   result = zf_2013_result[[sn]],
#   iter = itr, clust_method = 'hc', aggl_method = "ward.D",
#   dist_method = "euclid", regularize = TRUE, topn = 50,
#   flag = list(debugFlag = TRUE, verboseFlag = TRUE), collate = FALSE,
#   return_order = TRUE)
# 
# iter5_clusts_reord2 <- archR::collate_archR_result(
#   result = zf_2013_result[[sn]],
#   iter = itr, clust_method = 'hc', aggl_method = "complete",
#   dist_method = "cor", regularize = TRUE, topn = 50,
#   flag = list(debugFlag = TRUE, verboseFlag = TRUE), collate = FALSE,
#   return_order = TRUE)


## Zebrafish result final clustering already uses cor + complete linkage
## So the result looks already good.

## these are in default archR ordering
clust_archR_ord_list <- archR::get_seqs_clust_list(
  seqs_clust_lab = zf_2013_result[[ sn ]]$seqsClustLabels[[itr]])

## these are now ordered by the hc ordering
clust_hc_ord_list <- lapply(iter5_clusts_reord$order, function(x){
  clust_archR_ord_list[[x]]
})


ordered_arch_pl <- archR::plot_arch_for_clusters(
  seqs = zf_2013_result[[ sn ]]$rawSeqs, 
  clust_list = clust_hc_ord_list, pos_lab = -45:150, 
  xt_freq = 5, set_titles = FALSE, show = FALSE, bits_yax = "auto")

ordered_arch_pl2 <- lapply(rev(ordered_arch_pl), function(pl){
  pl <- pl + 
        ggplot2::theme(axis.text = ggplot2::element_text(size = 0), 
                       axis.text.x = ggplot2::element_text(
                         angle = 0, vjust = 2, hjust = 0.5),
                       axis.text.y = ggplot2::element_text(vjust = 0.5),
                       axis.title.y = ggplot2::element_text(size = 0),
                       axis.ticks.length = ggplot2::unit(0.00, "cm"),
                       plot.margin = ggplot2::unit(c(-0.1,0,-0.4,-0.4), "cm"))
})
sam_foo <- cowplot::plot_grid(plotlist = ordered_arch_pl2, ncol = 1)

use_cutk <- 12

stopifnot(check_and_create_dir(result_dir_path[[sn]]))
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/dome_30perc_epiboly_results
fname <- file.path(result_dir_path[[sn]], paste0(sn, "_dend_arch_list_", 
                    use_dist, "_", use_aggl, "_cut", use_cutk))

sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname, use_cutk = use_cutk,
              clusts = iter5_clusts_reord, use_ht = 30, plot_png = FALSE,
              lwd = 0.4, repel = TRUE, show_labels = TRUE, labels_track_height = 0.25,
              rect = TRUE, rect_fill = TRUE,
              color_labels_by_k = TRUE)

# sam_foo2


###
temp_clusts <- cutree(iter5_clusts_reord, k = use_cutk)
names(temp_clusts) <- NULL
## Make further few clusters
nCl <- length(unique(temp_clusts))

temp_clusts[c(29)] <- temp_clusts[c(22)]


existing_clust <- sort(unique(temp_clusts))
for(i in seq_along(existing_clust)){
    idx <- which(temp_clusts == existing_clust[i])
    temp_clusts[idx] <- i
}

stopifnot(check_and_create_dir(result_dir_path[[sn]]))
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/dome_30perc_epiboly_results
fname2 <- file.path(result_dir_path[[sn]], paste0(sn, "_dend_arch_list_",
                                           use_dist, "_", use_aggl, "_",
                                           use_cutk, "clusters_final"))

## Also need to show alongside, how the final clusters' seqlogos look
clust_list <- lapply(unique(temp_clusts), function(x){which(temp_clusts == x)})
seqs_clusters_as_list[[sn]] <- archR::collate_clusters(clust_list, archR::get_seqs_clust_list(zf_2013_result[[sn]]$seqsClustLabels[[itr]]))
##

use_color <- scales::hue_pal()(length(unique(temp_clusts)))
sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname2, use_ht = 60,
               use_cutk = use_cutk,#length(unique(temp_clusts)), 
               clusts = iter5_clusts_reord, rect = TRUE, rect_fill = TRUE,
               label_cols = use_color[temp_clusts[iter5_clusts_reord$order]],
               k_colors = use_color, 
               clust_assignment = clust_list,
               new_clusts = seqs_clusters_as_list[[sn]],
               rawSeqs = zf_2013_result[[sn]]$rawSeqs,
               palette = FALSE, plot_png = FALSE)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled


# use_color <- scales::hue_pal()(length(unique(temp_clusts)))
# sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname2,
#                use_cutk = use_cutk,
#                #length(unique(temp_clusts)),
#                clusts = iter5_clusts_reord, rect = TRUE, rect_fill = TRUE,
#                label_cols = use_color[temp_clusts[iter5_clusts_reord$order]],
#                k_colors = use_color, plot_png = FALSE,
#                palette = FALSE)
# 
# 
# clust_list <- lapply(unique(temp_clusts), function(x){which(temp_clusts == x)})
# seqs_clusters_as_list[[sn]] <- archR::collate_clusters(clust_list, archR::get_seqs_clust_list(zf_2013_result[[ sn ]]$seqsClustLabels[[itr]]))
###


cluster_medians_IQW <- unlist(lapply(seqs_clusters_as_list[[sn]], function(x){
    median(zf_nepal2013_bed_info[[sn]]$IQW[x])
}))
cluster_means_IQW <- unlist(lapply(seqs_clusters_as_list[[sn]], function(x){
    mean(zf_nepal2013_bed_info[[sn]]$IQW[x])
}))

if(iqw_order_by_median){
    ascending_order_IQW <- sort(cluster_medians_IQW, decreasing = FALSE,
                            index.return = TRUE)
}else{
    ascending_order_IQW <- sort(cluster_means_IQW, decreasing = FALSE,
                            index.return = TRUE)
}
##
seqs_clusters_as_list_ordered[[sn]] <- 
                      lapply(ascending_order_IQW$ix,
                              function(x){
                                  seqs_clusters_as_list[[sn]][[x]]
                              })

perSample_archR_clusts[[sn]] <- seqs_clusters_as_list_ordered[[sn]]

1.3.3 Stage prim6


sn <- sample_names[3]
itr <- 5
use_aggl <- 'ward.D' 
use_dist <- 'cor'


iter5_clusts_reord <- archR::collate_archR_result(
  result = zf_2013_result[[sn]], 
  iter = itr, clust_method = 'hc', aggl_method = use_aggl, 
  dist_method = use_dist, regularize = FALSE, topn = 50, 
  flag = list(debugFlag = FALSE, verboseFlag = TRUE), collate = FALSE, 
  return_order = TRUE)

clust_archR_ord_list <- archR::get_seqs_clust_list(
  seqs_clust_lab = zf_2013_result[[ sn ]]$seqsClustLabels[[itr]])

## these are now ordered by the hc ordering
clust_hc_ord_list <- lapply(iter5_clusts_reord$order, function(x){
  clust_archR_ord_list[[x]]
})

ordered_arch_pl <- archR::plot_arch_for_clusters(
  seqs = zf_2013_result[[ sn ]]$rawSeqs, 
  clust_list = clust_hc_ord_list, pos_lab = -45:150, 
  xt_freq = 5, set_titles = FALSE, show = FALSE, bits_yax = "auto")

ordered_arch_pl2 <- lapply(rev(ordered_arch_pl), function(pl){
  pl <- pl + 
        ggplot2::theme(axis.text = ggplot2::element_text(size = 0), 
                       axis.text.x = ggplot2::element_text(
                         angle = 0, vjust = 2, hjust = 0.5),
                       axis.text.y = ggplot2::element_text(vjust = 0.5),
                       axis.title.y = ggplot2::element_text(size = 0),
                       axis.ticks.length = ggplot2::unit(0.00, "cm"),
                       plot.margin = ggplot2::unit(c(-0.1,0,-0.4,-0.4), "cm"))
})
sam_foo <- cowplot::plot_grid(plotlist = ordered_arch_pl2, ncol = 1)

use_cutk <- 12

stopifnot(check_and_create_dir(result_dir_path[[sn]]))
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/prim6_results
fname <- file.path(result_dir_path[[sn]], paste0(sn, "_dend_arch_list_", 
                    use_dist, "_", use_aggl, "_cut", use_cutk))

sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname, use_cutk = use_cutk,
              clusts = iter5_clusts_reord, use_ht = 30, plot_png = FALSE,
              lwd = 0.4, repel = TRUE, show_labels = TRUE, labels_track_height = 0.25,
              rect = TRUE, rect_fill = TRUE,
              color_labels_by_k = TRUE)

# sam_foo2


###
temp_clusts <- cutree(iter5_clusts_reord, k = use_cutk)
names(temp_clusts) <- NULL
## Make further few clusters
nCl <- length(unique(temp_clusts))

temp_clusts[c(21,2,29)] <- temp_clusts[c(19)]
temp_clusts[c(8)] <- temp_clusts[c(11)]

existing_clust <- sort(unique(temp_clusts))
for(i in seq_along(existing_clust)){
    idx <- which(temp_clusts == existing_clust[i])
    temp_clusts[idx] <- i
}

stopifnot(check_and_create_dir(result_dir_path[[sn]]))
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/prim6_results
fname2 <- file.path(result_dir_path[[sn]], paste0(sn, "_dend_arch_list_",
                                           use_dist, "_", use_aggl, "_",
                                           use_cutk, "clusters_final"))

## Also need to show alongside, how the final clusters' seqlogos look
clust_list <- lapply(unique(temp_clusts), function(x){which(temp_clusts == x)})
seqs_clusters_as_list[[sn]] <- archR::collate_clusters(clust_list, archR::get_seqs_clust_list(zf_2013_result[[sn]]$seqsClustLabels[[itr]]))
##

use_color <- scales::hue_pal()(length(unique(temp_clusts)))
sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname2, use_ht = 60,
               use_cutk = use_cutk,#length(unique(temp_clusts)), 
               clusts = iter5_clusts_reord, rect = TRUE, rect_fill = TRUE,
               label_cols = use_color[temp_clusts[iter5_clusts_reord$order]],
               k_colors = use_color, 
               clust_assignment = clust_list,
               new_clusts = seqs_clusters_as_list[[sn]],
               rawSeqs = zf_2013_result[[sn]]$rawSeqs,
               palette = FALSE, plot_png = FALSE)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled


# use_color <- scales::hue_pal()(length(unique(temp_clusts)))
# sam_foo2 <- plot_dend_arch(arch_plot = sam_foo, fname = fname2,
#                use_cutk = use_cutk,
#                #length(unique(temp_clusts)),
#                clusts = iter5_clusts_reord, rect = TRUE, rect_fill = TRUE,
#                label_cols = use_color[temp_clusts[iter5_clusts_reord$order]],
#                k_colors = use_color, plot_png = FALSE,
#                palette = FALSE)
# 
# 
# clust_list <- lapply(unique(temp_clusts), function(x){which(temp_clusts == x)})
# seqs_clusters_as_list[[sn]] <- archR::collate_clusters(clust_list, archR::get_seqs_clust_list(zf_2013_result[[ sn ]]$seqsClustLabels[[itr]]))
###

###

cluster_medians_IQW <- unlist(lapply(seqs_clusters_as_list[[sn]], function(x){
    median(zf_nepal2013_bed_info[[sn]]$IQW[x])
}))
cluster_means_IQW <- unlist(lapply(seqs_clusters_as_list[[sn]], function(x){
    mean(zf_nepal2013_bed_info[[sn]]$IQW[x])
}))

if(iqw_order_by_median){
    ascending_order_IQW <- sort(cluster_medians_IQW, decreasing = FALSE,
                            index.return = TRUE)
}else{
    ascending_order_IQW <- sort(cluster_means_IQW, decreasing = FALSE,
                            index.return = TRUE)
}
##
seqs_clusters_as_list_ordered[[sn]] <- 
                      lapply(ascending_order_IQW$ix,
                              function(x){
                                  seqs_clusters_as_list[[sn]][[x]]
                              })

perSample_archR_clusts[[sn]] <- seqs_clusters_as_list_ordered[[sn]]
## Display as carousel
fnames <- unlist(lapply(sample_names, function(x){
  respath <- file.path(archR_org_results_path, paste0(x, "_results"))
  flist <- list.files(respath, pattern = paste0(x, "_dend_arch_list*"))
  last3 <- unlist(lapply(flist, function(y){
    substr(y, nchar(y)-2, nchar(y))
    }))
  idx <- which(last3 == "png")
  # stopifnot(length(flist[idx]) == 1)
  file.path(respath, flist[idx])
  }))



cP1 <- htmlwidgets::JS("function(slick,index) {
                        return '<a>'+(index+1)+'</a>';
                   }")

slick_clustering_imgs <- slickR::slickR(obj = fnames, slideId = "clusterings", 
  slideType = "img", width="100%", height = "800px") + 
  slickR::settings(initialSlide = 0, 
          infinite = FALSE,
          focusOnSelect = TRUE,
          dots = TRUE,
          customPaging = cP1)
    

slick_clustering_imgs
for(sn in sample_names) {
    clust_lens <- unlist(lapply(seqs_clusters_as_list_ordered[[sn]], length))
    message("Ordered list lengths: ", paste(clust_lens, collapse = " "))
    message("Original list lengths:", 
      paste(unlist(lapply(seqs_clusters_as_list[[sn]], length)), collapse = " "))
  
  
    ##
    clust_lab <- rep("0", length(zf_2013_result[[sn]]$rawSeqs))
    clust_names <- sort(as.character(1:length(seqs_clusters_as_list_ordered[[sn]])))
    for(i in seq_along(seqs_clusters_as_list_ordered[[sn]])){
        clust_lab[seqs_clusters_as_list_ordered[[sn]][[i]] ] <- clust_names[i]
    }
  
  
    ### Get DF ob BED information
    samarth_df[[sn]] <- data.frame(chr = zf_nepal2013_bed_info[[sn]]$chr, 
                             start = zf_nepal2013_bed_info[[sn]]$start,
                             end = zf_nepal2013_bed_info[[sn]]$end, 
                             strand = zf_nepal2013_bed_info[[sn]]$strand,
                             IQW = zf_nepal2013_bed_info[[sn]]$IQW,
                             domTPM = zf_nepal2013_bed_info[[sn]]$domTPM,
                             # phast = phastCons_scores[[sn]],
                             clust_ID = clust_lab)
}
## Ordered list lengths: 1785 2356 1039 1002 6473 5613
## Original list lengths:5613 1785 6473 2356 1002 1039
## Ordered list lengths: 66 34 315 5886 390 7155 3621 2585 53 116 571
## Original list lengths:7155 3621 390 5886 571 66 34 2585 53 315 116
## Ordered list lengths: 51 70 104 32 181 2503 2000 4383 1621 9021
## Original list lengths:4383 9021 2503 1621 2000 32 51 70 181 104

1.4 Cluster-wise BED files


if(do_bedfile_write){
    message("Task: Write cluster-wise BED files to disk")
    for(sn in sample_names){
        stopifnot(check_and_create_dir(result_dir_path[[sn]]))
        ##
        bedFilesPath <- file.path(result_dir_path[[sn]], "Cluster_BED_tracks")
        stopifnot(check_and_create_dir(bedFilesPath))
        
        message("Writing cluster BED track files at:", bedFilesPath)
        ##
        #### Write bed to disk -- each cluster in a separate bed file
        cat(paste0("\n\n### Individual cluster track BED files [", sn, "]\n\n"))
        for(lo in seq_along(seqs_clusters_as_list_ordered[[sn]])){
            ##
            bedFilename <- file.path(bedFilesPath,
                                  paste0("zf_nepal2013_TC_sample_", sn, 
                                  "_cluster", lo, ".bed"))
            # print(bedFilename)
            seq_ids <- seqs_clusters_as_list_ordered[[sn]][[lo]]
            limit_df <- samarth_df[[sn]][seq_ids,]
            track.name <- paste0(sn, "_danRer10_", "clust", lo)
            write_as_track_bed(limit_df, seq_ids, track_name = track.name, 
                               bedFilename = bedFilename)
            dload_text <- xfun::embed_file(path = bedFilename,  
                        text = paste("Download coordinates for cluster", lo, " as browser track"))
            cat(paste0("\n<", dload_text$name," href=\"", dload_text$attribs$href, "\" download=\"", dload_text$attribs$download, "\">", dload_text$children[[1]][1],
                "</a>\n"))
        }
        #####
        message("Preparing strand-separated files")
        ## Strand-wise separate files per cluster
        cat(paste0("\n\n### Strand-separated individual cluster track BED files[", sn, "]\n\n"))
        for(lo in seq_along(seqs_clusters_as_list_ordered[[sn]])){
            ## plus strand
            bedFilename_plus <- file.path(bedFilesPath,
                                paste0("zf_nepal2013_TC_sample_", sn, 
                                  "_cluster", lo, "_plus_strand.bed"))
            ##
            chosen_idx_plus_strand <- get_strand_specific_indices(df = samarth_df[[sn]],
                                seq_ids_in_clust = seqs_clusters_as_list_ordered[[sn]][[lo]],
                                strand_val = "+")
            ##
            if(length(chosen_idx_plus_strand) < 1){
              cat(paste0("<a href= >Empty",   "</a>"))
            }else{
            ##
              limit_df <- samarth_df[[sn]][chosen_idx_plus_strand,]
              track.name <- paste0(sn, "_danRer10_", "clust", lo, "_plus_strand")
              write_as_track_bed(limit_df, chosen_idx_plus_strand, track_name = track.name, 
                                 bedFilename = bedFilename_plus)
              dload_text <- xfun::embed_file(bedFilename_plus, 
                          text = paste("Download coordinates for cluster", 
                                       lo, "(+ strand) as browser track"))
              cat(paste0("\n<", dload_text$name," href=\"", dload_text$attribs$href, "\" download=\"", dload_text$attribs$download, "\">", dload_text$children[[1]][1],
                  "</a>,  "))
            }
            ## minus strand
            bedFilename_minus <- file.path(bedFilesPath,
                                     paste0("zf_nepal2013_TC_sample_",
                                            sn, "_cluster",
                                            lo, "_minus_strand.bed"))
            ##
            ##
            chosen_idx_minus_strand <- get_strand_specific_indices(df = samarth_df[[sn]],
                                seq_ids_in_clust = seqs_clusters_as_list_ordered[[sn]][[lo]],
                                strand_val = "-")
            ##
            if(length(chosen_idx_minus_strand) < 1){
              cat(paste0("<a href= >Empty",   "</a>\n"))
            }else{
              limit_df <- samarth_df[[sn]][chosen_idx_minus_strand,]
              track.name <- paste0(sn, "_danRer10_", "clust", lo, "_plus_strand")
              write_as_track_bed(limit_df, chosen_idx_minus_strand, track_name = track.name, 
                                 bedFilename = bedFilename_minus)
              dload_text <- xfun::embed_file(bedFilename_minus, 
                    text = paste("Download coordinates for cluster", 
                                       lo, "(- strand) as browser track"))
              cat(paste0("<", dload_text$name," href=\"", dload_text$attribs$href, "\" download=\"", dload_text$attribs$download, "\">", dload_text$children[[1]][1], "</a>\n"))
            }
            ##
        }
    }
    ##
    dload_all <- xfun::embed_dir(bedFilesPath, 
              text = paste("Download all clusters as separate browser track files (zip)"))
        cat(paste0("### All cluster track files (one zipped folder of all BED files)\n\n"))
        cat(paste0("\n<", dload_all$name," href=\"", dload_all$attribs$href, "\" download=\"", dload_all$attribs$download, "\">", dload_all$children[[1]][1],
                "</a>\n"))
    ##
}else{
    message("do_bedfile_write is FALSE")
}

1.5 IQW-TPM-PhastCons plots

####
if(do_IQW_TPM_plots){
    message("Task: Producing IQW_TPM_PhastCons scores combined plot")
    for(sn in sample_names){
        stopifnot(check_and_create_dir(result_dir_path[[sn]]))
        pl_iqw <- get_iqw_ord_plot(iqw = TRUE, y_axis_text = TRUE, 
                    samarth_df = samarth_df[[sn]], 
                    seqs_clust = seqs_clusters_as_list_ordered[[sn]])
        pl_iqw <- pl_iqw + ggeasy::easy_all_text_size(size=18) + 
                  NULL +
                  ggeasy::easy_y_axis_labels_size(size=18) +
                  ggeasy::easy_x_axis_labels_size(size=18) +
                  NULL
        ##
        pl_tpm <- get_iqw_ord_plot(tpm = TRUE, y_axis_text = FALSE, 
                    samarth_df = samarth_df[[sn]], 
                    seqs_clust = seqs_clusters_as_list_ordered[[sn]])
        pl_tpm <- pl_tpm + ggeasy::easy_all_text_size(size=18) + 
                  NULL +
                  # ggeasy::easy_y_axis_labels_size(size=18) +
                  ggeasy::easy_x_axis_labels_size(size=18) +
                  NULL
        ##
        # pl_phast <- get_iqw_ord_plot(phast = TRUE, y_axis_text = FALSE, 
        #             samarth_df = samarth_df[[sn]], 
        #             seqs_clust = seqs_clusters_as_list_ordered[[sn]])
        # pl_phast <- pl_phast + ggeasy::easy_all_text_size(size=18) + 
        #           NULL +
        #           # ggeasy::easy_y_axis_labels_size(size=18) +
        #           ggeasy::easy_x_axis_labels_size(size=18) +
        #           NULL
        ## IQW ordered plots
        iqw_tpm_ord_pls <- pl_iqw | pl_tpm
        # iqw_tpm_phast_ord_pls <- pl_iqw | pl_tpm | pl_phast
        perSample_pl[[sn]] <- iqw_tpm_ord_pls #iqw_tpm_phast_ord_pls
        
        # title_str <- paste0("Sample_", sn, "_IQW_TPM_phast_plot")
        title_str <- paste0("Sample_", sn, "_IQW_TPM_plot")
        pl_w_title <- perSample_pl[[sn]] + plot_annotation(title = title_str)
        cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                paste0(title_str, ".pdf")),
                plot = pl_w_title,
                base_height = 12, base_width = 6
                )
        # cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
        #         paste0(title_str, ".png")),
        #         plot = pl_w_title,
        #         base_height = 17, base_width = 8
        #         )
        # ggsave(filename = file.path(result_dir_path[[sn]],
        #        paste0(title_str, ".pdf")),
        #        plot = pl_w_title,
        #        width = 10, height = 7)
        # ggsave(filename = file.path(result_dir_path[[sn]],
        #        paste0(title_str, ".png")),
        #        plot = pl_w_title, device = "png", dpi = 300,
        #        width = 10, height = 7)
        # print(pl_w_title)
        
    }
    
}else{
    message("do_iqw_tpm_plot is FALSE")
}

1.6 Sequence logos of cluster architectures

Sequence logos of clusters ordered by their median interquantile width (ascending order). These are included as part of the combined panels. See 64 cells stage, dome 30% epiboly stage and prim6 stage.


#fig.width=11, fig.height=10, out.width="1200px", out.height="800px",

##
if(do_plot_arch){
    use_ht <- list(15, 30, 20)
    names(use_ht) <- sample_names
    for(sn in sample_names){
        stopifnot(check_and_create_dir(result_dir_path[[sn]]))
        ##
        message("Generating architectures for clusters of sequences...")
        fname <- file.path(result_dir_path[[sn]],
                           paste0("Architectures_0-max.pdf"))
        iter5_zf_arch_list <- lapply(seq_along(perSample_archR_clusts[[sn]]), 
          function(y){
          x <- perSample_archR_clusts[[sn]][[y]]
          pl <- archR::plot_ggseqlogo_of_seqs(as.character(zf_2013_result[[sn]]$rawSeqs[x]), 
            pos_lab = -45:150, bits_yax = "auto", title = NULL)
          pl + theme(axis.text = element_text(size=10), 
                     # axis.text.x = element_text(angle=0, vjust = 2, hjust = 0.5),
                     # axis.text.x = element_text(angle=90, vjust = 0, hjust = 0),
                     axis.text.y = element_text(vjust = 0.5),
                     axis.title.y = element_text(size=10),
                     axis.ticks.length = unit(0.04, "cm"),
                     plot.margin = unit(c(0,0.1,-0.4,0.1), "cm")) +
            ggplot2::scale_y_continuous(sec.axis = dup_axis(name = paste0("C",y), labels = NULL))
          # pl + theme(plot.margin = unit(c(0,0.2,-0.5,0.1), "cm"))
          # pl + theme(plot.margin = unit(c(0,0,-0.3,0), "cm"))
          
        })
        # print(length(iter5_dm_arch_list))
        
        r1c1 <- cowplot::plot_grid(plotlist = iter5_zf_arch_list, ncol = 1)
        # cowplot::save_plot(fname, plot = r1c1, limitsize = FALSE, base_width = 30,
        #             nrow = length(seqs_clusters_as_list_ordered[[sn]]))
        cowplot::ggsave2(fname, plot= r1c1, width=25, height=use_ht[[sn]], units="cm",
              dpi = 600, limitsize = FALSE)
        
        ## save PNGs
        # for(p in 1:length(iter5_zf_arch_list)){
        #     this_result_dir_path <- file.path(result_dir_path[[sn]], "arch_png")
        #     stopifnot(check_and_create_dir(this_result_dir_path))
        #     fname <- file.path(this_result_dir_path, 
        #             paste0("Architecture_clust", p, "_0-max.png"))
        #     cowplot::ggsave2(fname, plot= iter5_zf_arch_list[[p]], 
        #       width=25, height=3, units="cm", dpi = 300)
        # }
        # r1c1
        perSample_arch_combined[[sn]] <- r1c1
        perSample_arch[[sn]] <- iter5_zf_arch_list
        
        if(file.exists(fname)){
            knitr::include_graphics(fname)
        }
    }
}else{
    message("do_plot_arch is FALSE")
}
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/64_cells_results
## Generating architectures for clusters of sequences...
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/dome_30perc_epiboly_results
## 
## Generating architectures for clusters of sequences...
## 
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/prim6_results
## 
## Generating architectures for clusters of sequences...
## 
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
##

1.7 Per sample list of ENTREZ IDs per cluster

perSample_entrezList <- lapply(seq_along(perSample_archR_clusts), function(x){
            as_df_entrez_id <- as.data.frame(perSample_peakAnno[[x]])$ENTREZID
            sam <- lapply(perSample_archR_clusts[[x]], function(y){
              temp <- as_df_entrez_id[y]
              temp[which(!is.na(temp))]
            })
            names(sam) <- paste0("C", seq_along(sam))
            sam
        })
names(perSample_entrezList) <- names(perSample_archR_clusts)

1.8 Per cluster gene names [ZF Stage 64 cells]

sn <- sample_names[1]

peakAnno_df <- as.data.frame(perSample_peakAnno[[sn]])
## peakAnno_df here doesn't have columns tpm.dominant_ctss
peakAnno_df$tpm.dominant_ctss <- zf_nepal2013_bed_info[[sn]]$domTPM

unique_genes <- unique(peakAnno_df$SYMBOL)
sumTPM_across_TC_per_gene <- lapply(unique_genes, function(x){
    sum(peakAnno_df$tpm.dominant_ctss[which(peakAnno_df$SYMBOL == x)])
})
names(sumTPM_across_TC_per_gene) <- unique_genes 

genes1 <- lapply(1:length(perSample_archR_clusts[[sn]]), function(x){
      ##
      all_genenames <-
        peakAnno_df[unlist(perSample_archR_clusts[[sn]][x]), c("GENENAME", "ENTREZID", "geneId", "SYMBOL", "annotation", "width", "tpm.dominant_ctss")]
      ## To sort by motif scores, get motif scores first
      motif_scores <- suppressMessages(get_motif_scores(zf_2013_result, perSample_archR_clusts, sn, clust_id = x))
      ##
      ##
      anno_terms1 <- all_genenames$annotation
      anno_terms_splits <- strsplit(anno_terms1, " ")
      anno_terms <- unlist(lapply(anno_terms_splits, function(x){
            if(is.na(x[1])){ "NA";
            }else if(grepl("Intron", x[1]) || grepl("Exon", x[1])){
                paste0(x[1], " (", paste(x[3:6], collapse = " "))
            }else if(grepl("Promoter", x[1])){
                x[1]
            }else{ paste(x[1], x[2]) }
      }))
      ##
      sorted <- sort(motif_scores, index.return=TRUE,
                      decreasing = TRUE)
      ##
      ##
      ## To sort alphabetically by Gene Symbol
      naStr <- unlist(lapply(all_genenames$SYMBOL, function(x){
              if(is.na(x)){
                  "NA"
              }else{
                  x
              }
            }))
      all_genenames$SYMBOL <- naStr
      sorted <- sort(all_genenames$SYMBOL, index.return=TRUE)
      
      sam2 <- unlist(lapply(all_genenames$SYMBOL, function(x) {
          if(x != "NA"){
            sumTPM_across_TC_per_gene[[x]]
          }else{
            NA
          }
        }))
      ##
      # Displays as datatable which is sortable, searchable
        print_df <- data.frame("Symbol" = all_genenames$SYMBOL[sorted$ix],
                         "Weblink" = 
            paste0('<a href="https://www.ensembl.org/Danio_rerio/Gene/Summary?db=core;g=', 
                all_genenames$geneId[sorted$ix], '" target="_blank">',
                all_genenames$GENENAME[sorted$ix], 
          '</a> '),
            "Annot" = anno_terms[sorted$ix],
            "domTPM" = format(round(all_genenames$tpm.dominant_ctss[sorted$ix], 3), nsmall=3),
            "%%.domTPM" = format(round(100*(all_genenames$tpm.dominant_ctss[sorted$ix]/sam2[sorted$ix]), 2), nsmall=2),
            "Match.score"= form_str(motif_scores[sorted$ix])
            )
        colnames(print_df)[5] <- "%domTPM"
        return(print_df)
        # Displays as general html text
      paste0('--<span style="color:red">', all_genenames$SYMBOL[sorted$ix], '</span> ',
        '<a href="https://www.ensembl.org/Danio_rerio/Gene/Summary?db=core;g=', 
              all_genenames$geneId[sorted$ix], '" target="_blank">',
              all_genenames$GENENAME[sorted$ix], 
        '</a> ',
        '<span style="color:green">', form_str(sorted$x), '</span> ',
       collapse=" <br/> ")
    })

genes <- lapply(genes1, function(x){
    DT::datatable(x, escape = c(1), rownames = FALSE, width = 600, height = 1200, 
                  options = list(paging= FALSE), 
                  caption = paste0("TSS info [", nrow(x), " entries] sorted by the gene symbol"))
})


# MatchScores <- unlist(lapply(1:length(perSample_archR_clusts[[sn]]), function(x){
#       ## To sort by motif scores, get motif scores first 
#       motif_scores <- suppressMessages(get_motif_scores(zf_2013_result, perSample_archR_clusts, sn, clust_id = x))
#       ##
#       sorted <- sort(motif_scores, index.return=TRUE, 
#                       decreasing = TRUE)
#       ##
#       paste0('<a href="">', form_str(sorted$x), 
#         '</a>', collapse=", <br/> ")
#     }))
# 
#  message(length(genes), " & ", length(MatchScores))

1.9 Per cluster gene names [ZF Stage dome 30% epiboly]

sn <- sample_names[2]

peakAnno_df <- as.data.frame(perSample_peakAnno[[sn]])
## peakAnno_df here doesn't have columns tpm.dominant_ctss
peakAnno_df$tpm.dominant_ctss <- zf_nepal2013_bed_info[[sn]]$domTPM

unique_genes <- unique(peakAnno_df$SYMBOL)
sumTPM_across_TC_per_gene <- lapply(unique_genes, function(x){
    sum(peakAnno_df$tpm.dominant_ctss[which(peakAnno_df$SYMBOL == x)])
})
names(sumTPM_across_TC_per_gene) <- unique_genes 


genes1 <- lapply(seq_along(perSample_archR_clusts[[sn]]), function(x){
      ##
      all_genenames <-
        peakAnno_df[unlist(perSample_archR_clusts[[sn]][x]), c("GENENAME", "ENTREZID", "geneId", "SYMBOL", "annotation", "width", "tpm.dominant_ctss")]
      ## To sort by motif scores, get motif scores first
      motif_scores <- suppressMessages(get_motif_scores(zf_2013_result, perSample_archR_clusts, sn, clust_id = x))
      ##
      ##
      anno_terms1 <- all_genenames$annotation
      anno_terms_splits <- strsplit(anno_terms1, " ")
      anno_terms <- unlist(lapply(anno_terms_splits, function(x){
            if(is.na(x[1])){ "NA";
            }else if(grepl("Intron", x[1]) || grepl("Exon", x[1])){
                paste0(x[1], " (", paste(x[3:6], collapse = " "))
            }else if(grepl("Promoter", x[1])){
                x[1]
            }else{ paste(x[1], x[2]) }
      }))
      ##
      sorted <- sort(motif_scores, index.return=TRUE,
                      decreasing = TRUE)
      
      ##
      ## To sort alphabetically by Gene Symbol
      naStr <- unlist(lapply(all_genenames$SYMBOL, function(x){
              if(is.na(x)){
                  "NA"
              }else{
                  x
              }
            }))
      all_genenames$SYMBOL <- naStr
      sorted <- sort(all_genenames$SYMBOL, index.return=TRUE)
      
      sam2 <- unlist(lapply(all_genenames$SYMBOL, function(x) {
          if(x != "NA"){
            sumTPM_across_TC_per_gene[[x]]
          }else{
            NA
          }
        }))
      
      ##
      # Displays as datatable which is sortable, searchable
        print_df <- data.frame("Symbol" = all_genenames$SYMBOL[sorted$ix],
                         "Weblink" = 
            paste0('<a href="https://www.ensembl.org/Danio_rerio/Gene/Summary?db=core;g=', 
                all_genenames$geneId[sorted$ix], '" target="_blank">',
                all_genenames$GENENAME[sorted$ix], 
          '</a> '),
            "Annot" = anno_terms[sorted$ix],
            "domTPM" = format(round(all_genenames$tpm.dominant_ctss[sorted$ix], 3), nsmall=3),
            "%%.domTPM" = format(round(100*(all_genenames$tpm.dominant_ctss[sorted$ix]/sam2[sorted$ix]), 2), nsmall=2),
            "Match.score"= form_str(motif_scores[sorted$ix])
            )
        colnames(print_df)[5] <- "%domTPM"
        return(print_df)
        # Displays as general html text
      paste0('--<span style="color:red">', all_genenames$SYMBOL[sorted$ix], '</span> ',
        '<a href="https://www.ensembl.org/Danio_rerio/Gene/Summary?db=core;g=', 
              all_genenames$geneId[sorted$ix], '" target="_blank">',
              all_genenames$GENENAME[sorted$ix], 
        '</a> ',
        '<span style="color:green">', form_str(sorted$x), '</span> ',
       collapse=" <br/> ")
    })

genes <- lapply(genes1, function(x){
    DT::datatable(x, escape = c(1), rownames = FALSE, width = 600, height = 1200, 
                  options = list(paging= FALSE), 
                  caption = paste0("TSS info [", nrow(x), " entries] sorted by the gene symbol"))
})


# MatchScores <- unlist(lapply(1:length(perSample_archR_clusts[[sn]]), function(x){
#       ## To sort by motif scores, get motif scores first 
#       motif_scores <- suppressMessages(get_motif_scores(zf_2013_result, perSample_archR_clusts, sn, clust_id = x))
#       ##
#       sorted <- sort(motif_scores, index.return=TRUE, 
#                       decreasing = TRUE)
#       ##
#       paste0('<a href="">', form_str(sorted$x), 
#         '</a>', collapse=", <br/> ")
#     }))
# 
# # message(length(genes), " & ", length(MatchScores))

1.10 Per cluster gene names [ZF Stage prim6]

sn <- sample_names[3]

peakAnno_df <- as.data.frame(perSample_peakAnno[[sn]])
## peakAnno_df here doesn't have columns tpm.dominant_ctss
peakAnno_df$tpm.dominant_ctss <- zf_nepal2013_bed_info[[sn]]$domTPM

unique_genes <- unique(peakAnno_df$SYMBOL)
sumTPM_across_TC_per_gene <- lapply(unique_genes, function(x){
    sum(peakAnno_df$tpm.dominant_ctss[which(peakAnno_df$SYMBOL == x)])
})
names(sumTPM_across_TC_per_gene) <- unique_genes 


genes1 <- lapply(1:length(perSample_archR_clusts[[sn]]), function(x){
      ##
      all_genenames <-
        peakAnno_df[unlist(perSample_archR_clusts[[sn]][x]), c("GENENAME", "ENTREZID", "geneId", "SYMBOL", "annotation", "width", "tpm.dominant_ctss")]
      ## To sort by motif scores, get motif scores first
      motif_scores <- suppressMessages(get_motif_scores(zf_2013_result, perSample_archR_clusts, sn, clust_id = x))
      ##
      ##
      anno_terms1 <- all_genenames$annotation
      anno_terms_splits <- strsplit(anno_terms1, " ")
      anno_terms <- unlist(lapply(anno_terms_splits, function(x){
            if(is.na(x[1])){ "NA";
            }else if(grepl("Intron", x[1]) || grepl("Exon", x[1])){
                paste0(x[1], " (", paste(x[3:6], collapse = " "))
            }else if(grepl("Promoter", x[1])){
                x[1]
            }else{ paste(x[1], x[2]) }
      }))
      ##
      sorted <- sort(motif_scores, index.return=TRUE,
                      decreasing = TRUE)
      
      ##
      ## To sort alphabetically by Gene Symbol
      naStr <- unlist(lapply(all_genenames$SYMBOL, function(x){
              if(is.na(x)){
                  "NA"
              }else{
                  x
              }
            }))
      all_genenames$SYMBOL <- naStr
      sorted <- sort(all_genenames$SYMBOL, index.return=TRUE)
      
      sam2 <- unlist(lapply(all_genenames$SYMBOL, function(x) {
          if(x != "NA"){
            sumTPM_across_TC_per_gene[[x]]
          }else{
            NA
          }
        }))
      
      ##
      # Displays as datatable which is sortable, searchable
        print_df <- data.frame("Symbol" = all_genenames$SYMBOL[sorted$ix],
                         "Weblink" = 
            paste0('<a href="https://www.ensembl.org/Danio_rerio/Gene/Summary?db=core;g=', 
                all_genenames$geneId[sorted$ix], '" target="_blank">',
                all_genenames$GENENAME[sorted$ix], 
          '</a> '),
            "Annot" = anno_terms[sorted$ix],
            "domTPM" = format(round(all_genenames$tpm.dominant_ctss[sorted$ix], 3), nsmall=3),
            "%%.domTPM" = format(round(100*(all_genenames$tpm.dominant_ctss[sorted$ix]/sam2[sorted$ix]), 2), nsmall=2),
            "Match.score"= form_str(motif_scores[sorted$ix])
            )
        colnames(print_df)[5] <- "%domTPM"
        return(print_df)
        # Displays as general html text
      paste0('--<span style="color:red">', all_genenames$SYMBOL[sorted$ix], '</span> ',
        '<a href="https://www.ensembl.org/Danio_rerio/Gene/Summary?db=core;g=', 
              all_genenames$geneId[sorted$ix], '" target="_blank">',
              all_genenames$GENENAME[sorted$ix], 
        '</a> ',
        '<span style="color:green">', form_str(sorted$x), '</span> ',
       collapse=" <br/> ")
    })

genes <- lapply(genes1, function(x){
    DT::datatable(x, escape = c(1), rownames = FALSE, width = 600, height = 1200, 
                  options = list(paging= FALSE), 
                  caption = paste0("TSS info [", nrow(x), " entries] sorted by the gene symbol"))
})


# MatchScores <- unlist(lapply(1:length(perSample_archR_clusts[[sn]]), function(x){
#       ## To sort by motif scores, get motif scores first
#       motif_scores <- suppressMessages(get_motif_scores(zf_2013_result, perSample_archR_clusts, sn, clust_id = x))
#       ##
#       sorted <- sort(motif_scores, index.return=TRUE,
#                       decreasing = TRUE)
#       ##
#       paste0('<a href="">', form_str(sorted$x),
#         '</a>', collapse=", <br/> ")
#     }))

1.11 Figures for the paper

1.11.1 For paper: IQW-TPM-PhastCons plots

suffix_label <- c("X", "Y", "Z")
names(suffix_label) <- sample_names
use_txt_size <- c(20, 20 ,20)
names(use_txt_size) <- sample_names
use_axis_text_size <- 35
use_axis_title_size <- 40

####
if(do_IQW_TPM_plots){
    message("Task: Producing IQW_TPM_PhastCons scores combined plot")
    for(sn in sample_names){
        stopifnot(check_and_create_dir(result_dir_path[[sn]]))
        pl_iqw <- get_iqw_ord_plot(iqw = TRUE, y_axis_text = TRUE, 
                    samarth_df = samarth_df[[sn]], text_size = use_axis_title_size,
                    seqs_clust = seqs_clusters_as_list_ordered[[sn]], 
                    use_suffix = suffix_label[sn], use_notch = FALSE)
        # pl_iqw <- pl_iqw + ggeasy::easy_all_text_size(size=18) + 
        #           NULL +
        #           ggeasy::easy_y_axis_labels_size(size=18) +
        #           ggeasy::easy_x_axis_labels_size(size=18) +
        #           NULL
        ##
        pl_tpm <- get_iqw_ord_plot(tpm = TRUE, y_axis_text = FALSE, 
                    samarth_df = samarth_df[[sn]], text_size = use_axis_title_size,
                    seqs_clust = seqs_clusters_as_list_ordered[[sn]], 
                    use_suffix = suffix_label[sn], use_notch = FALSE)
        # pl_tpm <- pl_tpm + ggeasy::easy_all_text_size(size=18) + 
        #           NULL +
        #           # ggeasy::easy_y_axis_labels_size(size=18) +
        #           ggeasy::easy_x_axis_labels_size(size=18) +
        #           NULL
        ##
        # pl_phast <- get_iqw_ord_plot(phast = TRUE, y_axis_text = FALSE, 
        #             samarth_df = samarth_df[[sn]], 
        #             seqs_clust = seqs_clusters_as_list_ordered[[sn]])
        # pl_phast <- pl_phast + ggeasy::easy_all_text_size(size=18) + 
        #           NULL +
        #           # ggeasy::easy_y_axis_labels_size(size=18) +
        #           ggeasy::easy_x_axis_labels_size(size=18) +
        #           NULL
        ## IQW ordered plots
        iqw_tpm_ord_pls <- pl_iqw | pl_tpm
        # iqw_tpm_phast_ord_pls <- pl_iqw | pl_tpm | pl_phast
        perSample_pl[[sn]] <- iqw_tpm_ord_pls #iqw_tpm_phast_ord_pls
        
        # title_str <- paste0("Sample_", sn, "_IQW_TPM_phast_plot")
        title_str <- paste0("Sample_", sn, "_IQW_TPM_plot")
        pl_w_title <- perSample_pl[[sn]] + plot_annotation(title = title_str)
        cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                paste0(title_str, "_paper.pdf")),
                plot = perSample_pl[[sn]], 
                base_height = 10, base_width = 15
                )
        # cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
        #         paste0(title_str, ".png")),
        #         plot = pl_w_title,
        #         base_height = 17, base_width = 8
        #         )
        # ggsave(filename = file.path(result_dir_path[[sn]],
        #        paste0(title_str, ".pdf")),
        #        plot = pl_w_title,
        #        width = 10, height = 7)
        # ggsave(filename = file.path(result_dir_path[[sn]],
        #        paste0(title_str, ".png")),
        #        plot = pl_w_title, device = "png", dpi = 300,
        #        width = 10, height = 7)
        # print(pl_w_title)
        
    }
    
}else{
    message("do_iqw_tpm_plot is FALSE")
}

1.11.2 For paper: Sequence logos of cluster architectures

Sequence logos of clusters ordered by their median interquantile width (ascending order). These are included as part of the combined panels. See 64 cells stage, dome 30% epiboly stage and prim6 stage.


#fig.width=11, fig.height=10, out.width="1200px", out.height="800px",
##
if(do_plot_arch){
    use_ht <- list(15, 30, 20)
    names(use_ht) <- sample_names
    for(sn in sample_names){
        stopifnot(check_and_create_dir(result_dir_path[[sn]]))
        ##
        message("Generating architectures for clusters of sequences...")
        fname <- file.path(result_dir_path[[sn]],
                           paste0("Architectures_0-max_paper.pdf"))
        iter5_zf_arch_list <- lapply(seq_along(perSample_archR_clusts[[sn]]), 
          function(y){
          x <- perSample_archR_clusts[[sn]][[y]]
          pl <- archR::plot_ggseqlogo_of_seqs(as.character(zf_2013_result[[sn]]$rawSeqs[x]), 
            pos_lab = -45:150, bits_yax = "auto", title = NULL, xt_freq = 15)
          pl <- pl + theme_classic() +  
                    theme(axis.text = element_text(size=use_axis_text_size),
                     axis.text.x = element_text(angle=0, vjust = 0.5, hjust = 0.5, size = use_axis_text_size),
                     axis.text.y = element_text(hjust = -1, size = use_axis_text_size),
                     # axis.text.y.left = element_text(hjust = -5), 
                     # axis.text.y.left = element_text(vjust = 0.5, hjust = 1.5), 
                     axis.title.y = element_text(size = use_axis_title_size),# axis.line.y.right = element_blank(),
                     axis.title.y.left = element_text(margin = unit(c(0, 0, 0, 0.1), "cm")),
                     axis.title.y.right = element_text(margin = unit(c(0, 0, 0, 0.5), "cm")),
                     axis.ticks.length.y.left = unit(0.15, "cm"),
                     axis.ticks.length.y.right = unit(0.00, "cm"),
                     axis.ticks.length.x.bottom = unit(0.2, "cm"),
                     axis.ticks = element_line(size=0.4),
                     # plot.margin = unit(c(0,0,0,0), "cm")
                     plot.margin = unit(c(0,0,0,0.4), "cm")
                     ) +
              # theme(axis.text = element_text(size=use_txt_size), 
              #        # axis.text.x = element_text(angle=0, vjust = 2, hjust = 0.5),
              #        # axis.text.x = element_text(angle=90, vjust = 0, hjust = 0),
              #        axis.text.y = element_text(vjust = 0.5),
              #        axis.title.y = element_text(size=use_txt_size),
              #        axis.ticks.length = unit(0.04, "cm"),
              #        plot.margin = unit(c(0,0.1,-0.4,0.1), "cm")) +
            ggplot2::scale_y_continuous(breaks = seq(0.0, 2.0, by = 0.5),
                                        labels = scales::number_format(accuracy = 0.1),
                                        sec.axis = dup_axis(name = paste0("C",y, suffix_label[sn]),
                                                            labels = NULL))
          # pl + theme(plot.margin = unit(c(0,0.2,-0.5,0.1), "cm"))
          # pl + theme(plot.margin = unit(c(0,0,-0.3,0), "cm"))
          if(y == 1){
                pl <- pl + theme(plot.margin = unit(c(0,0,0,0.4), "cm"))
            }else{
                pl <- pl + theme(plot.margin = unit(c(0,0,0,0.4), "cm"))
            }
          
          pl
        })
        
        ## keep x-axis text for only few plots
        iter5_zf_arch_list <- lapply(seq_along(iter5_zf_arch_list), function(x){
            foo <- iter5_zf_arch_list[[x]] #+ ggplot2::theme(title = element_blank())
            if(x != length(iter5_zf_arch_list)){
                foo2 <- foo + ggplot2::theme(
                            axis.text.x.bottom = element_blank(),
                            axis.ticks.length.y.left = ggplot2::unit(0.15, "cm"),
                            axis.ticks.length.x.bottom = ggplot2::unit(0.2, "cm"),
                            plot.margin = ggplot2::unit(c(0.2, 0.3, -0.01, 0.1), "cm"))
                return(foo2)
            }else{
                foo2 <- foo + ggplot2::theme(
                            plot.margin = ggplot2::unit(c(0.1, 0.3, -1, 0.1), "cm"))
                return(foo2)
            }
            })
        
        ######### Zoomed plot for prim6 stage
        message("Nb of plots: ", length(iter5_zf_arch_list))
        if(sn == "prim6"){
            message("====Creating zoomed in plot====")
            y <- length(perSample_archR_clusts[[sn]])
            x <- perSample_archR_clusts[[sn]][[y]]
            ##
            sub_seqs <- Biostrings::subseq(zf_2013_result[[sn]]$rawSeqs[x], 
                start = 91, 
                end = Biostrings::width(zf_2013_result[[sn]]$rawSeqs[1]))
            ##
            pl <- seqArchR::plot_ggseqlogo_of_seqs(sub_seqs, 
            pos_lab = 45:150, bits_yax = "auto", title = NULL, xt_freq = 5)
            zoomed_pl <- pl + theme_classic() +  
                ggplot2::scale_x_continuous(breaks = seq(1, 106, by = 5), 
                                            labels = seq(45, 150, by = 5)) + 
                ggplot2::scale_y_continuous(breaks = seq(0.0, 0.5, by = 0.05),
                                        labels = scales::number_format(accuracy = 0.05),
                                        sec.axis = dup_axis(name = paste0("C",y, suffix_label[sn]),
                                                            labels = NULL)) + 
                theme(axis.text = element_text(size=use_axis_text_size),
                     axis.text.x = element_text(angle=0, vjust = 0.5, hjust = 0.5),
                     axis.text.y = element_text(hjust = -1),
                     # axis.text.y.left = element_text(hjust = -5), 
                     # axis.text.y.left = element_text(vjust = 0.5, hjust = 1.5), 
                     axis.title.y = element_text(size=use_axis_title_size),# axis.line.y.right = element_blank(),
                     axis.title.y.left = element_text(margin = unit(c(0, 0, 0, 0.1), "cm")),
                     axis.title.y.right = element_text(margin = unit(c(0, 0, 0, 0.5), "cm")),
                     axis.ticks.length.y.left = unit(0.15, "cm"),
                     axis.ticks.length.y.right = unit(0.00, "cm"),
                     axis.ticks.length.x.bottom = unit(0.2, "cm"),
                     axis.ticks = element_line(size=0.4),
                     # plot.margin = unit(c(0,0,0,0), "cm")
                     plot.margin = unit(c(0,0,0,0), "cm")
                     )
            

        }
        message("Nb of plots: ", length(iter5_zf_arch_list))
        #########
        
        # print(length(iter5_dm_arch_list))
        
        r1c1 <- cowplot::plot_grid(plotlist = iter5_zf_arch_list, ncol = 1, 
                                   nrow = length(perSample_archR_clusts[[sn]]))
        cowplot::save_plot(fname, plot = r1c1, limitsize = FALSE,
                           base_height = 2, base_width = 21,
                           ncol = 1,
                           nrow = length(perSample_archR_clusts[[sn]]),
                           dpi = 600)
        # cowplot::ggsave2(fname, plot= r1c1, width=25, height=use_ht[[sn]], units="cm",
        #       dpi = 600, limitsize = FALSE)
        
        # ## save PNGs
        # for(p in 1:length(iter5_zf_arch_list)){
        #     this_result_dir_path <- file.path(result_dir_path[[sn]], "arch_png")
        #     stopifnot(check_and_create_dir(this_result_dir_path))
        #     fname <- file.path(this_result_dir_path, 
        #             paste0("Architecture_clust", p, "_0-max.png"))
        #     cowplot::ggsave2(fname, plot= iter5_zf_arch_list[[p]], 
        #       width=25, height=3, units="cm", dpi = 300)
        # }
        # r1c1
        perSample_arch_combined[[sn]] <- r1c1
        perSample_arch[[sn]] <- iter5_zf_arch_list
        
        if(file.exists(fname)){
            knitr::include_graphics(fname)
        }
    }
}else{
    message("do_plot_arch is FALSE")
}
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/64_cells_results
## Generating architectures for clusters of sequences...
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Nb of plots: 6
## 
## Nb of plots: 6
## 
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/dome_30perc_epiboly_results
## 
## Generating architectures for clusters of sequences...
## 
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Nb of plots: 11
## 
## Nb of plots: 11
## 
## Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/prim6_results
## 
## Generating architectures for clusters of sequences...
## 
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Nb of plots: 10
## 
## ====Creating zoomed in plot====
## 
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Nb of plots: 10
##

1.11.3 Fpr paper: Overlaps

overlapTypeInfo <- vector("list", length(sample_names))
names(overlapTypeInfo) <- sample_names


iter_sample_names <- c(1,2,3,1,2)

for(s in seq_along(sample_names)){
  this_clusters <- perSample_archR_clusts[[sample_names[s]]]
  overlapTypeInfo[[s]] <- data.frame(
        clust = rep("0", length(unlist(this_clusters))),  
        Type = rep("N", length(unlist(this_clusters)))
          )
  perCluster_overlaps <- vector("list", length(this_clusters))
  for(l in seq_along(this_clusters)){
      sam <- this_clusters[[l]]
      overlapTypeInfo[[s]][sam, "clust"] <- l
      ## overlapping with next sample/s/s+1
      Atemp <- IRanges::overlapsAny(
        query = GenomicRanges::promoters(
            perSample_CAGEobj[[ sample_names[[iter_sample_names[s]]] ]][sam],
            upstream = 45, downstream = 45),
        subject = GenomicRanges::promoters(
            perSample_CAGEobj[[ sample_names[[iter_sample_names[s+1] ]] ]],
            upstream = 45, downstream = 45))
      A <- which(Atemp == TRUE)
      Anb <- length(A)
      # print(A)
      # print(Anb)
      useIdx <- this_clusters[[l]][A]
      overlapTypeInfo[[s]][useIdx, "Type"] <- "1-2"
      
      ## overlapping with next to next sample/s/s+2
      Btemp <- IRanges::overlapsAny(
        query = GenomicRanges::promoters(
            perSample_CAGEobj[[ sample_names[[iter_sample_names[s]]] ]][sam],
            upstream = 45, downstream = 45),
        subject = GenomicRanges::promoters(
            perSample_CAGEobj[[ sample_names[[iter_sample_names[s+2] ]] ]],
            upstream = 45, downstream = 45))
      ##
      B <- which(Btemp == TRUE)
      Bnb <- length(B)
      # print(B)
      # print(Bnb)
      useIdx <- this_clusters[[l]][B]
      overlapTypeInfo[[s]][useIdx, "Type"] <- "1-3"
      
      ## unique to self
      nonA <- which(Atemp == FALSE)
      nonB <- which(Btemp == FALSE)
      C <- intersect(nonA, nonB)
      Cnb <- length(C)
      # print(C)
      # print(Cnb)
      useIdx <- this_clusters[[l]][C]
      overlapTypeInfo[[s]][useIdx, "Type"] <- "1"
      
      ## Common all
      inAll <- intersect(A,B)
      Dnb <- length(inAll)
      # print(inAll)
      # print(Dnb)
      useIdx <- this_clusters[[l]][inAll]
      overlapTypeInfo[[s]][useIdx, "Type"] <- "1-2-3"
      
      ##
      # message(paste(length(sam), Anb, Bnb, Cnb, Dnb, sum(Anb,Bnb,Cnb,Dnb), collapse=", "))
      # print(Bnb)
      # print(Cnb)
      perCluster_overlaps[[l]] <- c(Anb, Bnb, Cnb, Dnb)
  }
  perCluster_overlaps_perSample[[sample_names[s]]] <- do.call(rbind, perCluster_overlaps)
}


use_labs <- list(c("64C", "64C & D", "All", "64C & P6"), 
             c("D", "D & P6", "All", "64C & D"),
             c("P6", "64C & P6", "All", "D & P6")
            )
names(use_labs) <- sample_names
bar_pls <- vector("list", length(sample_names))
names(bar_pls) <- sample_names

for(sn in sample_names){
    # result_dir_path <- file.path(archR_org_results_path, paste0(sn, "_results"))
    # stopifnot(check_and_create_dir(result_dir_path))
    
    clust_labs <- unlist(lapply(seq_along(perSample_archR_clusts[[sn]]),
      function(x){
        paste0("C", x, suffix_label[[sn]] 
            # " (", length(perSample_archR_clusts[[sn]][[x]]), ")"
            )
        }))
     
    bar_pl <- ggplot(overlapTypeInfo[[sn]], 
      aes(y = fct_rev(reorder(clust, as.numeric(clust))), fill = Type)) + 
      geom_bar(position = position_fill(reverse=TRUE),
                width = 0.6) +
      ggplot2::theme_bw() + 
      ggplot2::scale_fill_discrete(name = "", 
                                   labels = use_labs[[sn]]) +
      ggplot2::scale_x_continuous("Proportion", limits = c(0,1)) +
      # ggplot2::ylab(NULL) + 
      ggplot2::scale_y_discrete("", position = "left",
                        labels =  rev(clust_labs)) +
      ggeasy::easy_legend_at("top") +
      ggplot2::theme(legend.text = element_text(size = 10)) + 
      ggplot2::guides(fill = guide_legend(ncol = 2, byrow = TRUE)) +
      ggeasy::easy_all_text_size(size=10)
    bar_pls[[sn]] <- bar_pl
}

overlap_grid_pl <- cowplot::plot_grid(plotlist = bar_pls, nrow = 1, align = "h")

cowplot::save_plot(filename = file.path(result_dir_path[[sn]], "zf_figure_showing_overlaps_for_stages.pdf"),
                plot = overlap_grid_pl, ncol = 3, nrow = 2,
                base_height = 2, base_width = 3, limitsize=FALSE)

1.11.4 For paper: Chr4 vs Other chromosomes proportion

cl_char <- c("X", "Y", "Z")
pl_list <- vector("list", length(sample_names))
for(sn in seq_along(sample_names)){
    seqArchRplus::per_cluster_strand_dist(sname = sample_names[[sn]], 
        clusts = perSample_archR_clusts[[sn]], 
        info_df = samarth_df[[sn]], dir_path = result_dir_path[[sn]])
    
    
    info_df <- samarth_df[[sn]]
    clusts <- perSample_archR_clusts[[sn]]
    
    chr4_df_list <- lapply(seq_along(clusts), function(x){
        df_strand <- data.frame(
            "chr" = info_df$chr[clusts[[x]]],
            "strand" = info_df$strand[clusts[[x]]]
        )
        df_strand_tab <- table(df_strand)
        chr4_df <- data.frame(Chromosome = c("Chr4", "Other"), 
                              Freq = c(0, 0))
        total <- sum(df_strand_tab)
        chr4_df$Freq[1] <- if("chr4" %in% rownames(df_strand_tab)) sum(df_strand_tab["chr4",]) else 0
        chr4_df$Freq[2] <- total - chr4_df$Freq[1]
        chr4_df$Proportion <- chr4_df$Freq / total
        return(chr4_df)
    })
    
    
    chr4_df <- do.call(rbind, chr4_df_list)
    
    chr4_df$Cluster <- rep(paste0("C", seq(chr4_df_list), cl_char[sn]), each=2)
    
    pl_list[[sn]] <- ggpubr::ggbarplot(chr4_df, x = "Cluster", y = "Proportion", 
        ylim = c(0,1.0), palette = "Paired2", fill = "Chromosome", 
        order = paste0("C", rev(seq(chr4_df_list)), cl_char[sn]), 
        width = 0.7, orientation = "horizontal", legend = "top")
    
    cowplot::ggsave2(filename = file.path(result_dir_path[[sn]], 
                                            "Chr4_OtherChr_proportions.pdf"), 
                    plot = pl_list[[sn]], device = "pdf", width = 7, 
                    height = 5)
}
## 
## ── All clusters' strand distributions ──────────────────────────────────────────
## 
## ── Sample: 64_cells ──
## 
## ! Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/64_cells_results/64_cells_results
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## 
## 
## ── All clusters' strand distributions ──────────────────────────────────────────
## 
## 
## 
## ── Sample: dome_30perc_epiboly ──
## 
## 
## 
## ! Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/dome_30perc_epiboly_results/dome_30perc_epiboly_results
## 
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## 
## 
## ── All clusters' strand distributions ──────────────────────────────────────────
## 
## 
## 
## ── Sample: prim6 ──
## 
## 
## 
## ! Directory exists: results/zebrafish-nepal2013/with_archR_v0.1.8/prim6_results/prim6_results
## 
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
grid_pl <- cowplot::plot_grid(pl_list[[1]] + ggplot2::theme(legend.position = "none"),
                              pl_list[[2]] + ggplot2::theme(legend.position = "none"),
                              pl_list[[3]] + ggplot2::theme(legend.position = "none"),
                                ncol = 3)

legend <- cowplot::get_legend(
  # create some space to the left of the legend
  pl_list[[1]] + theme(legend.box.margin = margin(0, 0, 0, 12))
)


grid_pl <- cowplot::plot_grid(legend, grid_pl, 
                rel_heights = c(0.07,1), 
                ncol = 1)
cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                                paste0("zebrafish_figure_paper_chr4_proportion_all_samples.pdf")),
                plot = grid_pl, ncol = 3, 
                base_height = 5, base_width = 3,
                limitsize=FALSE, dpi = 300, device = "pdf")

1.11.5 For paper: All of it together

## fig.width=18, fig.height=12, warning=FALSE,results='hide'
## use cowplot to arrange plots
txdb_ensGene_fname <- file.path(archR_org_data_path, paste0("TxDb.Drerio.UCSC.danRer10.ensGene_self.2021-06-19.sqlite"))
if(!file.exists(txdb_ensGene_fname)){
    txdb_ensGene_ucsc <- GenomicFeatures::makeTxDbFromUCSC(genome="danRer10",
                                                       tablename="ensGene")
    AnnotationDbi::saveDb(txdb_ensGene_ucsc, file= txdb_ensGene_fname)
}else{
    txdb_ensGene_ucsc <- AnnotationDbi::loadDb(txdb_ensGene_fname)
}

use_ht <- list(1.5, 1.5, 2)
use_wd <- list(14, 14, 18)
# use_axis_text_size <- 25
# use_axis_title_size <- 30
use_legend_text_size <- list(45, 45, 25)
names(use_legend_text_size) <- sample_names
    
names(use_ht) <- sample_names
names(use_wd) <- sample_names

panel_pls <- vector("list", length(sample_names))
names(panel_pls) <- sample_names

for(sn in sample_names){
    
    stopifnot(check_and_create_dir(result_dir_path[[sn]]))
    
    clust_labs <- unlist(lapply(seq_along(perSample_archR_clusts[[sn]]),
      function(x){
        paste0(x, "_(n=", length(perSample_archR_clusts[[sn]][[x]]), ")")
        }))
    
    clustwise_anno <- lapply(perSample_archR_clusts[[sn]], function(x){
        foo_anno <- ChIPseeker::annotatePeak(perSample_CAGEobj[[sn]][x,],
                                             tssRegion=c(-500, 100),
                                             TxDb = txdb_ensGene_ucsc,
                                             annoDb = "org.Dr.eg.db")
        foo_anno
    })
    names(clustwise_anno) <- seq(1, length(perSample_archR_clusts[[sn]]))
    
    ## Solution using custom plotting
    features_order <- c("Promoter", "5' UTR", "1st Exon", "Other Exon",
     "1st Intron", "Other Intron", "Distal Intergenic", "3' UTR", 
        "Downstream (<=300)")
    ## 
    sam <- dplyr::bind_rows(lapply(clustwise_anno, function(x) x@annoStat), .id = "clust")
    
    colrs <- RColorBrewer::brewer.pal(n = 9, name = "Paired")
    
    levels(sam$Feature) <- features_order
    
    names(colrs) <- features_order

    clustwise_annobar <- ggplot(sam,
      aes(y = fct_rev(reorder(clust, as.numeric(clust))), x = Frequency, fill = Feature)) +
      geom_bar(stat = "identity", width = 0.6, position = position_fill(reverse=TRUE)) +
      ggplot2::scale_fill_manual(name = "", values = colrs) +
      ggplot2::theme_classic() +
      ggplot2::ylab(NULL) +
      ggplot2::xlab("Proportion") + 
      ggplot2::theme(
                     axis.text.y = element_blank(),
                     axis.ticks.length.y.left = unit(0.1, units = "cm"),
                     axis.ticks.length.x.bottom = unit(0.1, units = "cm"),
                     axis.text.x.bottom = element_text(size = use_axis_text_size),
                     axis.title.x.bottom = element_text(size = use_axis_title_size),
                     legend.position = "top",
                     legend.text = element_text(size = use_legend_text_size[[sn]])
                     ) +
      ggplot2::guides(fill = guide_legend(ncol = 2, byrow = TRUE,
                                    legend.box.margin = margin(5, 5, 0, 0)
                                    #legend.margin =margin(r=10,l=10,t=5,b=5),
                                    #legend.key.size = unit(5, "cm")
                                         ))
    
      

    ## Solution using ChIPseeker package function
    # names(clustwise_anno) <- paste0("C", seq(length(clustwise_anno)), suffix_label[sn])
    # clustwise_annobar <- ChIPseeker::plotAnnoBar(clustwise_anno)
    # clustwise_annobar <- clustwise_annobar + ggplot2::theme_classic() +
    # ggplot2::theme(text=element_text(size=use_txt_size),
    #                title = NULL,
    #                legend.position = "bottom",
    #                axis.text.y.left = element_blank()) +
    # ggplot2::ggtitle(NULL) +
    # ggplot2::guides(fill = guide_legend(nrow = 3, byrow = TRUE
    #                                   ))
    
    if(sn == "prim6"){
        ## Collect legend for later
        legend <- cowplot::get_legend(
                    # create some space to the left of the legend
                    clustwise_annobar + theme(legend.box.margin = margin(0, 0, 0, 0))
                    )
        bottom_row <- cowplot::plot_grid(NULL, zoomed_pl, NULL,
                                    nrow = 1, rel_widths = c(0.4, 1, 0.25),
                                    align = "h")
        top_row <- cowplot::plot_grid(perSample_pl[[sn]],
                                    perSample_arch_combined[[sn]],
                                    clustwise_annobar + ggplot2::theme(legend.position = "none"),
                                    ncol = 3,
                                    rel_widths = c(0.5, 1, 0.3),
                                    # rel_heights = c(1, 2.5, 1),
                                    rel_heights = c(1, 0.3),
                                    axis = "tb",
                                    align = "h")
        
        panel_pl <- cowplot::plot_grid(top_row,
                                    bottom_row,
                                    nrow = 2,
                                    rel_heights = c(1, 0.1),
                                    axis = "tb",
                                    align = "h"
                                    )
        legend_row <- cowplot::plot_grid(NULL, NULL, NULL, legend, nrow = 1)
        ## With improved legend?
        grid_pl <- cowplot::plot_grid(legend_row, panel_pl,
            rel_heights = c(0.1, 1), ncol = 1)
        cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                                            paste0("zebrafish_figure_paper_", sn,".pdf")),
                plot = grid_pl, ncol = 3, nrow = 8,
                base_height = 3, base_width = 14,
                limitsize=FALSE, dpi = 300, device = "pdf")
    }else{
        ## Collect legend for later
        legend2 <- cowplot::get_legend(
                    # create some space to the left of the legend
                    clustwise_annobar + theme(legend.box.margin = margin(0, 0, 0, 0))
                    )
        panel_pl <- cowplot::plot_grid(perSample_pl[[sn]],
                                    perSample_arch_combined[[sn]],
                                    clustwise_annobar + ggplot2::theme(legend.position = "none"),
                                    ncol = 3, 
                                    #labels = c('A', 'B'), vjust=1, hjust = 0,
                                    rel_widths = c(0.5, 1, 0.3),
                                    # rel_heights = c(1, 2.5, 1),
                                    rel_heights = c(1),
                                    axis = "tb",
                                    align = "h"
                                    )
        cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                                            paste0("zebrafish_figure_paper_", sn,".pdf")),
                plot = panel_pl, ncol = 3, nrow = 10,
                base_height = 3, base_width = 20,
                limitsize=FALSE, dpi = 300, device = "pdf")
        panel_pls[[sn]] <- panel_pl
    }

}
## >> preparing features information...      2023-01-09 16:58:21 
## >> identifying nearest features...        2023-01-09 16:58:22 
## >> calculating distance from peak to TSS...   2023-01-09 16:58:22 
## >> assigning genomic annotation...        2023-01-09 16:58:22 
## >> adding gene annotation...          2023-01-09 16:58:30 
## >> assigning chromosome lengths           2023-01-09 16:58:30 
## >> done...                    2023-01-09 16:58:30 
## >> preparing features information...      2023-01-09 16:58:30 
## >> identifying nearest features...        2023-01-09 16:58:30 
## >> calculating distance from peak to TSS...   2023-01-09 16:58:30 
## >> assigning genomic annotation...        2023-01-09 16:58:30 
## >> adding gene annotation...          2023-01-09 16:58:32 
## >> assigning chromosome lengths           2023-01-09 16:58:32 
## >> done...                    2023-01-09 16:58:32 
## >> preparing features information...      2023-01-09 16:58:32 
## >> identifying nearest features...        2023-01-09 16:58:32 
## >> calculating distance from peak to TSS...   2023-01-09 16:58:32 
## >> assigning genomic annotation...        2023-01-09 16:58:32 
## >> adding gene annotation...          2023-01-09 16:58:33 
## >> assigning chromosome lengths           2023-01-09 16:58:33 
## >> done...                    2023-01-09 16:58:33 
## >> preparing features information...      2023-01-09 16:58:33 
## >> identifying nearest features...        2023-01-09 16:58:33 
## >> calculating distance from peak to TSS...   2023-01-09 16:58:33 
## >> assigning genomic annotation...        2023-01-09 16:58:33 
## >> adding gene annotation...          2023-01-09 16:58:34 
## >> assigning chromosome lengths           2023-01-09 16:58:35 
## >> done...                    2023-01-09 16:58:35 
## >> preparing features information...      2023-01-09 16:58:35 
## >> identifying nearest features...        2023-01-09 16:58:35 
## >> calculating distance from peak to TSS...   2023-01-09 16:58:35 
## >> assigning genomic annotation...        2023-01-09 16:58:35 
## >> adding gene annotation...          2023-01-09 16:58:36 
## >> assigning chromosome lengths           2023-01-09 16:58:36 
## >> done...                    2023-01-09 16:58:36 
## >> preparing features information...      2023-01-09 16:58:36 
## >> identifying nearest features...        2023-01-09 16:58:36 
## >> calculating distance from peak to TSS...   2023-01-09 16:58:36 
## >> assigning genomic annotation...        2023-01-09 16:58:36 
## >> adding gene annotation...          2023-01-09 16:58:37 
## >> assigning chromosome lengths           2023-01-09 16:58:38 
## >> done...                    2023-01-09 16:58:38
## >> preparing features information...      2023-01-09 16:59:04 
## >> identifying nearest features...        2023-01-09 16:59:04 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:04 
## >> assigning genomic annotation...        2023-01-09 16:59:04 
## >> adding gene annotation...          2023-01-09 16:59:05 
## >> assigning chromosome lengths           2023-01-09 16:59:05 
## >> done...                    2023-01-09 16:59:06 
## >> preparing features information...      2023-01-09 16:59:06 
## >> identifying nearest features...        2023-01-09 16:59:06 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:06 
## >> assigning genomic annotation...        2023-01-09 16:59:06 
## >> adding gene annotation...          2023-01-09 16:59:07 
## >> assigning chromosome lengths           2023-01-09 16:59:07 
## >> done...                    2023-01-09 16:59:07 
## >> preparing features information...      2023-01-09 16:59:07 
## >> identifying nearest features...        2023-01-09 16:59:07 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:07 
## >> assigning genomic annotation...        2023-01-09 16:59:07 
## >> adding gene annotation...          2023-01-09 16:59:08 
## >> assigning chromosome lengths           2023-01-09 16:59:08 
## >> done...                    2023-01-09 16:59:08 
## >> preparing features information...      2023-01-09 16:59:08 
## >> identifying nearest features...        2023-01-09 16:59:08 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:08 
## >> assigning genomic annotation...        2023-01-09 16:59:08 
## >> adding gene annotation...          2023-01-09 16:59:10 
## >> assigning chromosome lengths           2023-01-09 16:59:10 
## >> done...                    2023-01-09 16:59:10 
## >> preparing features information...      2023-01-09 16:59:10 
## >> identifying nearest features...        2023-01-09 16:59:10 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:10 
## >> assigning genomic annotation...        2023-01-09 16:59:10 
## >> adding gene annotation...          2023-01-09 16:59:11 
## >> assigning chromosome lengths           2023-01-09 16:59:11 
## >> done...                    2023-01-09 16:59:11 
## >> preparing features information...      2023-01-09 16:59:11 
## >> identifying nearest features...        2023-01-09 16:59:11 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:12 
## >> assigning genomic annotation...        2023-01-09 16:59:12 
## >> adding gene annotation...          2023-01-09 16:59:13 
## >> assigning chromosome lengths           2023-01-09 16:59:13 
## >> done...                    2023-01-09 16:59:13 
## >> preparing features information...      2023-01-09 16:59:13 
## >> identifying nearest features...        2023-01-09 16:59:13 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:13 
## >> assigning genomic annotation...        2023-01-09 16:59:13 
## >> adding gene annotation...          2023-01-09 16:59:14 
## >> assigning chromosome lengths           2023-01-09 16:59:14 
## >> done...                    2023-01-09 16:59:14 
## >> preparing features information...      2023-01-09 16:59:14 
## >> identifying nearest features...        2023-01-09 16:59:14 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:15 
## >> assigning genomic annotation...        2023-01-09 16:59:15 
## >> adding gene annotation...          2023-01-09 16:59:16 
## >> assigning chromosome lengths           2023-01-09 16:59:16 
## >> done...                    2023-01-09 16:59:16 
## >> preparing features information...      2023-01-09 16:59:16 
## >> identifying nearest features...        2023-01-09 16:59:16 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:16 
## >> assigning genomic annotation...        2023-01-09 16:59:16 
## >> adding gene annotation...          2023-01-09 16:59:17 
## >> assigning chromosome lengths           2023-01-09 16:59:17 
## >> done...                    2023-01-09 16:59:17 
## >> preparing features information...      2023-01-09 16:59:17 
## >> identifying nearest features...        2023-01-09 16:59:17 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:18 
## >> assigning genomic annotation...        2023-01-09 16:59:18 
## >> adding gene annotation...          2023-01-09 16:59:19 
## >> assigning chromosome lengths           2023-01-09 16:59:19 
## >> done...                    2023-01-09 16:59:19 
## >> preparing features information...      2023-01-09 16:59:19 
## >> identifying nearest features...        2023-01-09 16:59:19 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:19 
## >> assigning genomic annotation...        2023-01-09 16:59:19 
## >> adding gene annotation...          2023-01-09 16:59:20 
## >> assigning chromosome lengths           2023-01-09 16:59:20 
## >> done...                    2023-01-09 16:59:20
## >> preparing features information...      2023-01-09 16:59:22 
## >> identifying nearest features...        2023-01-09 16:59:22 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:23 
## >> assigning genomic annotation...        2023-01-09 16:59:23 
## >> adding gene annotation...          2023-01-09 16:59:24 
## >> assigning chromosome lengths           2023-01-09 16:59:24 
## >> done...                    2023-01-09 16:59:24 
## >> preparing features information...      2023-01-09 16:59:24 
## >> identifying nearest features...        2023-01-09 16:59:24 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:24 
## >> assigning genomic annotation...        2023-01-09 16:59:24 
## >> adding gene annotation...          2023-01-09 16:59:25 
## >> assigning chromosome lengths           2023-01-09 16:59:25 
## >> done...                    2023-01-09 16:59:25 
## >> preparing features information...      2023-01-09 16:59:25 
## >> identifying nearest features...        2023-01-09 16:59:25 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:26 
## >> assigning genomic annotation...        2023-01-09 16:59:26 
## >> adding gene annotation...          2023-01-09 16:59:27 
## >> assigning chromosome lengths           2023-01-09 16:59:27 
## >> done...                    2023-01-09 16:59:27 
## >> preparing features information...      2023-01-09 16:59:27 
## >> identifying nearest features...        2023-01-09 16:59:27 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:27 
## >> assigning genomic annotation...        2023-01-09 16:59:27 
## >> adding gene annotation...          2023-01-09 16:59:28 
## >> assigning chromosome lengths           2023-01-09 16:59:28 
## >> done...                    2023-01-09 16:59:28 
## >> preparing features information...      2023-01-09 16:59:28 
## >> identifying nearest features...        2023-01-09 16:59:28 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:29 
## >> assigning genomic annotation...        2023-01-09 16:59:29 
## >> adding gene annotation...          2023-01-09 16:59:30 
## >> assigning chromosome lengths           2023-01-09 16:59:30 
## >> done...                    2023-01-09 16:59:30 
## >> preparing features information...      2023-01-09 16:59:30 
## >> identifying nearest features...        2023-01-09 16:59:30 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:30 
## >> assigning genomic annotation...        2023-01-09 16:59:30 
## >> adding gene annotation...          2023-01-09 16:59:32 
## >> assigning chromosome lengths           2023-01-09 16:59:33 
## >> done...                    2023-01-09 16:59:33 
## >> preparing features information...      2023-01-09 16:59:33 
## >> identifying nearest features...        2023-01-09 16:59:33 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:33 
## >> assigning genomic annotation...        2023-01-09 16:59:33 
## >> adding gene annotation...          2023-01-09 16:59:34 
## >> assigning chromosome lengths           2023-01-09 16:59:34 
## >> done...                    2023-01-09 16:59:34 
## >> preparing features information...      2023-01-09 16:59:34 
## >> identifying nearest features...        2023-01-09 16:59:34 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:34 
## >> assigning genomic annotation...        2023-01-09 16:59:34 
## >> adding gene annotation...          2023-01-09 16:59:35 
## >> assigning chromosome lengths           2023-01-09 16:59:35 
## >> done...                    2023-01-09 16:59:35 
## >> preparing features information...      2023-01-09 16:59:35 
## >> identifying nearest features...        2023-01-09 16:59:36 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:36 
## >> assigning genomic annotation...        2023-01-09 16:59:36 
## >> adding gene annotation...          2023-01-09 16:59:37 
## >> assigning chromosome lengths           2023-01-09 16:59:37 
## >> done...                    2023-01-09 16:59:37 
## >> preparing features information...      2023-01-09 16:59:37 
## >> identifying nearest features...        2023-01-09 16:59:37 
## >> calculating distance from peak to TSS...   2023-01-09 16:59:37 
## >> assigning genomic annotation...        2023-01-09 16:59:37 
## >> adding gene annotation...          2023-01-09 16:59:38 
## >> assigning chromosome lengths           2023-01-09 16:59:38 
## >> done...                    2023-01-09 16:59:38
## Create a grid for panels from first two stages
stage_combined_panel_pl <- cowplot::plot_grid(panel_pls[[1]], panel_pls[[2]],
                                    nrow = 2, 
                                    labels = c('A', 'B'), #vjust=0, hjust = 0,
                                    label_size = use_axis_title_size+10,
                                    rel_heights = c(0.6, 1),
                                    axis = "tb",
                                    align = "h"
                                    )
legend_row <- cowplot::plot_grid(NULL, NULL, NULL, legend2, nrow = 1)
## With improved legend?
grid_pl <- cowplot::plot_grid(legend_row, stage_combined_panel_pl,
    rel_heights = c(0.1, 1), ncol = 1)
cowplot::save_plot(filename = file.path(result_dir_path[[sn]],                                            paste0("zebrafish_figure_paper_combined_stage_1and2.pdf")),
                plot = grid_pl, ncol = 3, nrow = 15,
                base_height = 3, base_width = 18,
                limitsize=FALSE, dpi = 300, device = "pdf")

1.11.6 For paper: GO terms enrichments

# c1-3,5x
# c4,8y
# c6,7,8, 9z

## no ww signal/WW-box arch
arch_term <- "clusters_with_and_without_downstream_W_signal"
clust_list <- list(## Non-WW enrichment downstream
                    c(1,2), c(1,3), c(1,5), ## Also has W-box 
                    c(2,4), c(2,8),         ## Also has W-box
                    c(3,7), c(3,8), c(3,9), ## 
                    ## WW enrichment downstream
                    c(1,1), c(1,6),         ## Also has W-box
                    c(2,5), c(2,6), c(2,7), ## Also has W-box
                    c(3,10)                 ## only slight
    )

go_res <- lapply(clust_list, function(x){
                clusterProfiler::enrichGO(perSample_entrezList[[x[1]]][[x[2]]],
                    OrgDb = org.Dr.eg.db, ont = "all", pool = TRUE, 
                    keyType = "ENTREZID", 
                    pvalueCutoff = 0.05, qvalueCutoff = 0.1)
    })
use_names <- paste0("C", unlist(lapply(clust_list, function(x) x[2])), 
                c(rep("X", 3), rep("Y", 2), rep("Z",3), rep("X", 2), rep("Y", 3), rep("Z",1)))
stopifnot(length(use_names) == length(go_res))
names(go_res) <- use_names

compare_result <- clusterProfiler::merge_result(go_res)


go_term_fs <- 12
show_cats <- c(5, 10)
go_pl <- vector("list", length(show_cats))
for(i in seq_along(show_cats)){
    p1 <- dotplot(compare_result, showCategory = show_cats[i],
                          title = paste0("Comparing ", arch_term, " architectures"),
                          font.size = go_term_fs)
    pl <- p1 + DOSE::theme_dose(font.size = go_term_fs) +
            ggplot2::scale_y_discrete(labels = scales::label_wrap(50)) +
            ggplot2::scale_color_gradient() + 
            ggplot2::expand_limits(y = -1) + 
            ggplot2::annotate("rect", fill = c("red", "blue"), alpha = 0.5, 
                            xmin = c(-Inf, 7.5), xmax = c(7.5,Inf),
                            ymin = -1, ymax = 0) +
            ggplot2::theme(title = element_text(size = go_term_fs),
                           legend.text = element_text(size = go_term_fs),
                           legend.title = element_text(size = go_term_fs))
    go_pl[[i]] <- pl
    cowplot::save_plot(
        filename = file.path(result_dir_path[[3]], paste0(arch_term, "_go_plot_top", show_cats[i], "_terms.pdf")), 
        plot = go_pl[[i]], 
        base_height = 5, base_width = 8)    
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
wwbox_go_pl <- go_pl


# go_grid_pl <- cowplot::plot_grid(wwbox_go_pl[[1]], NULL, rel_widths = c(1, 0.4))
comb_pl <- cowplot::plot_grid(grid_pl, wwbox_go_pl[[1]], nrow = 2, 
                                rel_heights = c(0.3, 1),
                                labels = "AUTO", label_size = 14)

cowplot::save_plot(filename = file.path(result_dir_path[[3]], 
                                paste0("combined_top5_GO_mix_clusts_chr4_prop.pdf")), 
                    plot = comb_pl, base_height = 15, base_width = 14)

comb_pl <- cowplot::plot_grid(wwbox_go_pl[[2]], nrow = 1, labels = "AUTO", label_size = 14)

cowplot::save_plot(filename = file.path(result_dir_path[[3]], 
                                paste0("combined_top10_GO_mix_clusts.pdf")), 
                    plot = comb_pl, base_height = 15, base_width = 14)



## WW signal architecture periodic vs no period across stages: 
## collect c9z, c10x and c12y clusters and compare their enriched go terms
arch_term <- "WW_signal"
clust_list <- list(c(1,1), c(1,6), 
                    c(2,6), c(2,7), c(3,10))
go_res <- lapply(clust_list, function(x){
                clusterProfiler::enrichGO(perSample_entrezList[[x[1]]][[x[2]]],
                    OrgDb = org.Dr.eg.db, ont = "all", pool = TRUE, 
                    keyType = "ENTREZID", 
                    pvalueCutoff = 0.05, qvalueCutoff = 0.1)
    })
names(go_res) <- paste0("C", 
                        unlist(lapply(clust_list, function(x) x[2])), 
                c("X", "Y", "Z"))

compare_result <- clusterProfiler::merge_result(go_res)

go_term_fs <- 12
show_cats <- c(5, 10)
go_pl <- vector("list", length(show_cats))
for(i in seq_along(show_cats)){
    p1 <- dotplot(compare_result, showCategory = show_cats[i],
                          title = paste0("Comparing ", arch_term, " architectures"),
                          font.size = go_term_fs)
    pl <- p1 + DOSE::theme_dose(font.size = go_term_fs) +
            ggplot2::scale_y_discrete(labels = scales::label_wrap(50)) +
            ggplot2::scale_color_gradient() + 
            ggplot2::theme(title = element_text(size = go_term_fs),
                           legend.text = element_text(size = go_term_fs),
                           legend.title = element_text(size = go_term_fs))
    go_pl[[i]] <- pl
    cowplot::save_plot(
        filename = file.path(result_dir_path[[3]], paste0(arch_term, "_go_plot_top", show_cats[i], "_terms.pdf")), 
        plot = go_pl[[i]], 
        base_height = 5, base_width = 8)    
}
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
ww_go_pl <- go_pl

go_grid_pl <- cowplot::plot_grid(ww_go_pl[[1]], NULL, rel_widths = c(1, 0.4))
comb_pl <- cowplot::plot_grid(grid_pl, go_grid_pl, nrow = 2, 
                                rel_heights = c(0.75, 1),
                                labels = "AUTO", label_size = 14)

cowplot::save_plot(filename = file.path(result_dir_path[[3]], 
                                paste0("combined_top5_GO_ww_clusts_chr4_prop.pdf")), 
                    plot = comb_pl, base_height = 7, base_width = 10)

comb_pl <- cowplot::plot_grid(ww_go_pl[[2]], nrow = 1, labels = "AUTO", label_size = 14)

cowplot::save_plot(filename = file.path(result_dir_path[[3]], 
                                paste0("combined_top10_GO_ww_clusts.pdf")), 
                    plot = comb_pl, base_height = 8, base_width = 7)

1.12 For paper: Where are the shifting promoters

shifting_prom <- readxl::read_excel("/mnt/biggley/home/sarvesh/projects/promoter-architectures-all/clustering-promoter-architectures/promArch_only_for_biorxiv_manuscript/promArch/manuscript/biorxiv/experiments/data/zebrafish-shifting-promoters/41586_2014_BFnature12974_MOESM327_ESM.xls", skip = 1)

cl_char <- c("X", "Y", "Z")
perSample_perClust_shiftingProm <- vector("list", length(sample_names))
for(sn in seq_along(sample_names)){
    perSample_perClust_shiftingProm[[sn]] <- 
        lapply(seq_along(perSample_archR_clusts[[sn]]), function(x){
        sam_df <- as.data.frame(perSample_peakAnno[[sn]])
        this_entrez <- sam_df$ENTREZID[perSample_archR_clusts[[sn]][[x]]]
        shift_entrez <- shifting_prom$entrezgene
        ## intersections in this cluster
        which_present <- intersect(shift_entrez, this_entrez)
        message("C", x, cl_char[sn], " = ", length(which_present))
        which_present
    })
}
## C1X = 94
## C2X = 86
## C3X = 46
## C4X = 36
## C5X = 272
## C6X = 242
## C1Y = 2
## C2Y = 1
## C3Y = 0
## C4Y = 226
## C5Y = 11
## C6Y = 241
## C7Y = 135
## C8Y = 87
## C9Y = 0
## C10Y = 1
## C11Y = 17
## C1Z = 1
## C2Z = 0
## C3Z = 3
## C4Z = 0
## C5Z = 2
## C6Z = 84
## C7Z = 71
## C8Z = 143
## C9Z = 60
## C10Z = 330
pl_list <- vector("list", length(sample_names))
for(sn in seq_along(sample_names)){
    sam_df <- data.frame(Clusters = 
            paste0("C", seq(length(perSample_perClust_shiftingProm[[sn]])), cl_char[sn]),
            ShiftingPromoters = lengths(perSample_perClust_shiftingProm[[sn]]))

    pl_list[[sn]] <- ggpubr::ggbarplot(sam_df, y = "ShiftingPromoters", x = "Clusters", 
        label = TRUE, label.pos = "out", x.text.angle = 45, fill = "grey",
        ylab = paste0("#Shifting Promoters (", sum(sam_df$ShiftingPromoters), "/911)"))
}


grid_pl2 <- cowplot::plot_grid(plotlist = pl_list, ncol = 3)#, rel_widths = c(0.6, 1, 1))
cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                    paste0("zebrafish_figure_paper_shiftingProm_proportion_all_samples.pdf")),
                plot = grid_pl2, ncol = 4, 
                base_height = 4, base_width = 3,
                limitsize=FALSE, dpi = 300, device = "pdf")


grid_pl3 <- cowplot::plot_grid(grid_pl, grid_pl2, nrow = 2)
cowplot::save_plot(filename = file.path(result_dir_path[[sn]],
                    paste0("zebrafish_figure_paper_chr4_prop_shiftingProm_prop_all_samples.pdf")),
                plot = grid_pl3, ncol = 4, 
                base_height = 8, base_width = 3,
                limitsize=FALSE, dpi = 300, device = "pdf")

2 Session Info

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_DK.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_DK.UTF-8        LC_COLLATE=en_DK.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_DK.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] DT_0.26                                
##  [2] clusterProfiler_4.6.0                  
##  [3] ChIPseeker_1.34.1                      
##  [4] wesanderson_0.3.6                      
##  [5] cowplot_1.1.1                          
##  [6] patchwork_1.1.2                        
##  [7] forcats_0.5.2                          
##  [8] ggplot2_3.4.0                          
##  [9] reshape2_1.4.4                         
## [10] readr_2.1.3                            
## [11] CAGEr_2.2.0                            
## [12] MultiAssayExperiment_1.23.11           
## [13] SummarizedExperiment_1.28.0            
## [14] MatrixGenerics_1.10.0                  
## [15] matrixStats_0.63.0                     
## [16] ggeasy_0.1.3                           
## [17] org.Dr.eg.db_3.16.0                    
## [18] TxDb.Drerio.UCSC.danRer10.refGene_3.4.6
## [19] GenomicFeatures_1.50.2                 
## [20] AnnotationDbi_1.60.0                   
## [21] Biobase_2.58.0                         
## [22] BSgenome.Drerio.UCSC.danRer10_1.4.2    
## [23] BSgenome_1.66.1                        
## [24] rtracklayer_1.58.0                     
## [25] Biostrings_2.66.0                      
## [26] XVector_0.38.0                         
## [27] GenomicRanges_1.50.1                   
## [28] GenomeInfoDb_1.34.4                    
## [29] IRanges_2.32.0                         
## [30] S4Vectors_0.36.1                       
## [31] BiocGenerics_0.44.0                    
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                             
##   [2] tidyselect_1.2.0                       
##   [3] htmlwidgets_1.5.4                      
##   [4] RSQLite_2.2.19                         
##   [5] grid_4.2.1                             
##   [6] BiocParallel_1.31.14                   
##   [7] scatterpie_0.1.8                       
##   [8] munsell_0.5.0                          
##   [9] ragg_1.2.3                             
##  [10] codetools_0.2-18                       
##  [11] withr_2.5.0                            
##  [12] colorspace_2.0-3                       
##  [13] GOSemSim_2.24.0                        
##  [14] filelock_1.0.2                         
##  [15] highr_0.9                              
##  [16] knitr_1.41                             
##  [17] rstudioapi_0.14                        
##  [18] ggsignif_0.6.4                         
##  [19] DOSE_3.24.2                            
##  [20] labeling_0.4.2                         
##  [21] seqArchR_1.2.0                         
##  [22] GenomeInfoDbData_1.2.9                 
##  [23] polyclip_1.10-4                        
##  [24] seqPattern_1.30.0                      
##  [25] bit64_4.0.5                            
##  [26] farver_2.1.1                           
##  [27] downloader_0.4                         
##  [28] vctrs_0.5.1                            
##  [29] treeio_1.22.0                          
##  [30] generics_0.1.3                         
##  [31] gson_0.0.9                             
##  [32] xfun_0.35                              
##  [33] BiocFileCache_2.6.0                    
##  [34] ggseqlogo_0.1                          
##  [35] R6_2.5.1                               
##  [36] graphlayouts_0.8.3                     
##  [37] archR_0.1.8                            
##  [38] VGAM_1.1-6                             
##  [39] bitops_1.0-7                           
##  [40] cachem_1.0.6                           
##  [41] fgsea_1.24.0                           
##  [42] gridGraphics_0.5-1                     
##  [43] DelayedArray_0.24.0                    
##  [44] assertthat_0.2.1                       
##  [45] BiocIO_1.8.0                           
##  [46] scales_1.2.1                           
##  [47] ggraph_2.1.0                           
##  [48] enrichplot_1.18.3                      
##  [49] gtable_0.3.1                           
##  [50] formula.tools_1.7.1                    
##  [51] tidygraph_1.2.2                        
##  [52] seqLogo_1.63.0                         
##  [53] rlang_1.0.6                            
##  [54] slickR_0.5.0                           
##  [55] systemfonts_1.0.4                      
##  [56] splines_4.2.1                          
##  [57] rstatix_0.7.1                          
##  [58] lazyeval_0.2.2                         
##  [59] broom_1.0.1                            
##  [60] abind_1.4-5                            
##  [61] yaml_2.3.6                             
##  [62] crosstalk_1.2.0                        
##  [63] backports_1.4.1                        
##  [64] qvalue_2.30.0                          
##  [65] tools_4.2.1                            
##  [66] bookdown_0.29                          
##  [67] ggplotify_0.1.0                        
##  [68] ellipsis_0.3.2                         
##  [69] gplots_3.1.3                           
##  [70] jquerylib_0.1.4                        
##  [71] RColorBrewer_1.1-3                     
##  [72] Rcpp_1.0.9                             
##  [73] plyr_1.8.7                             
##  [74] base64enc_0.1-3                        
##  [75] sparseMatrixStats_1.8.0                
##  [76] progress_1.2.2                         
##  [77] zlibbioc_1.44.0                        
##  [78] purrr_0.3.5                            
##  [79] RCurl_1.98-1.9                         
##  [80] prettyunits_1.1.1                      
##  [81] ggpubr_0.5.0                           
##  [82] viridis_0.6.2                          
##  [83] ggrepel_0.9.2                          
##  [84] cluster_2.1.3                          
##  [85] factoextra_1.0.7                       
##  [86] magrittr_2.0.3                         
##  [87] data.table_1.14.2                      
##  [88] mime_0.12                              
##  [89] hms_1.1.2                              
##  [90] evaluate_0.18                          
##  [91] HDO.db_0.99.1                          
##  [92] XML_3.99-0.13                          
##  [93] readxl_1.4.1                           
##  [94] gridExtra_2.3                          
##  [95] compiler_4.2.1                         
##  [96] biomaRt_2.54.0                         
##  [97] tibble_3.1.8                           
##  [98] KernSmooth_2.23-20                     
##  [99] crayon_1.5.2                           
## [100] shadowtext_0.1.2                       
## [101] htmltools_0.5.4                        
## [102] ggfun_0.0.9                            
## [103] mgcv_1.8-38                            
## [104] tzdb_0.3.0                             
## [105] tidyr_1.2.1                            
## [106] aplot_0.1.9                            
## [107] DBI_1.1.3                              
## [108] tweenr_2.0.2                           
## [109] dbplyr_2.2.1                           
## [110] MASS_7.3-57                            
## [111] rappdirs_0.3.3                         
## [112] boot_1.3-28                            
## [113] som_0.3-5.1                            
## [114] car_3.1-1                              
## [115] Matrix_1.5-1                           
## [116] permute_0.9-7                          
## [117] cli_3.4.1                              
## [118] gdata_2.18.0.1                         
## [119] evd_2.3-6.1                            
## [120] parallel_4.2.1                         
## [121] igraph_1.3.5                           
## [122] pkgconfig_2.0.3                        
## [123] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
## [124] GenomicAlignments_1.34.0               
## [125] hopach_2.57.0                          
## [126] xml2_1.3.3                             
## [127] ggtree_3.6.2                           
## [128] bslib_0.4.0                            
## [129] stringdist_0.9.8                       
## [130] yulab.utils_0.0.5                      
## [131] stringr_1.5.0                          
## [132] digest_0.6.31                          
## [133] vegan_2.6-2                            
## [134] cellranger_1.1.0                       
## [135] rmarkdown_2.17                         
## [136] fastmatch_1.1-3                        
## [137] tidytree_0.4.1                         
## [138] operator.tools_1.6.3                   
## [139] dendextend_1.16.0                      
## [140] seqArchRplus_0.99.8                    
## [141] DelayedMatrixStats_1.18.0              
## [142] restfulr_0.0.15                        
## [143] curl_4.3.3                             
## [144] Rsamtools_2.14.0                       
## [145] gtools_3.9.2.2                         
## [146] rjson_0.2.21                           
## [147] lifecycle_1.0.3                        
## [148] nlme_3.1-158                           
## [149] jsonlite_1.8.4                         
## [150] carData_3.0-5                          
## [151] viridisLite_0.4.1                      
## [152] fansi_1.0.3                            
## [153] pillar_1.8.1                           
## [154] lattice_0.20-45                        
## [155] KEGGREST_1.38.0                        
## [156] fastmap_1.1.0                          
## [157] httr_1.4.4                             
## [158] plotrix_3.8-2                          
## [159] GO.db_3.16.0                           
## [160] glue_1.6.2                             
## [161] PWMEnrich_4.33.0                       
## [162] png_0.1-8                              
## [163] bit_4.0.5                              
## [164] ggforce_0.4.1                          
## [165] stringi_1.7.8                          
## [166] sass_0.4.2                             
## [167] blob_1.2.3                             
## [168] textshaping_0.3.6                      
## [169] caTools_1.18.2                         
## [170] memoise_2.0.1                          
## [171] dplyr_1.0.10                           
## [172] ape_5.6-2